1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "AArch64TargetTransformInfo.h"
10 #include "AArch64ExpandImm.h"
11 #include "MCTargetDesc/AArch64AddressingModes.h"
12 #include "llvm/Analysis/LoopInfo.h"
13 #include "llvm/Analysis/TargetTransformInfo.h"
14 #include "llvm/CodeGen/BasicTTIImpl.h"
15 #include "llvm/CodeGen/CostTable.h"
16 #include "llvm/CodeGen/TargetLowering.h"
17 #include "llvm/IR/IntrinsicInst.h"
18 #include "llvm/IR/IntrinsicsAArch64.h"
19 #include "llvm/IR/PatternMatch.h"
20 #include "llvm/Support/Debug.h"
21 #include "llvm/Transforms/InstCombine/InstCombiner.h"
22 #include <algorithm>
23 using namespace llvm;
24 using namespace llvm::PatternMatch;
25 
26 #define DEBUG_TYPE "aarch64tti"
27 
// Hidden command-line override (default on) for the Falkor hardware-prefetcher
// unroll fix. NOTE(review): presumably consulted by the unrolling-preferences
// hook later in this file — confirm the actual user, it is not visible here.
static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);
30 
31 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
32                                          const Function *Callee) const {
33   const TargetMachine &TM = getTLI()->getTargetMachine();
34 
35   const FeatureBitset &CallerBits =
36       TM.getSubtargetImpl(*Caller)->getFeatureBits();
37   const FeatureBitset &CalleeBits =
38       TM.getSubtargetImpl(*Callee)->getFeatureBits();
39 
40   // Inline a callee if its target-features are a subset of the callers
41   // target-features.
42   return (CallerBits & CalleeBits) == CalleeBits;
43 }
44 
45 /// Calculate the cost of materializing a 64-bit value. This helper
46 /// method might only calculate a fraction of a larger immediate. Therefore it
47 /// is valid to return a cost of ZERO.
48 InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
49   // Check if the immediate can be encoded within an instruction.
50   if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
51     return 0;
52 
53   if (Val < 0)
54     Val = ~Val;
55 
56   // Calculate how many moves we will need to materialize this constant.
57   SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
58   AArch64_IMM::expandMOVImm(Val, 64, Insn);
59   return Insn.size();
60 }
61 
62 /// Calculate the cost of materializing the given constant.
63 InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
64                                               TTI::TargetCostKind CostKind) {
65   assert(Ty->isIntegerTy());
66 
67   unsigned BitSize = Ty->getPrimitiveSizeInBits();
68   if (BitSize == 0)
69     return ~0U;
70 
71   // Sign-extend all constants to a multiple of 64-bit.
72   APInt ImmVal = Imm;
73   if (BitSize & 0x3f)
74     ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
75 
76   // Split the constant into 64-bit chunks and calculate the cost for each
77   // chunk.
78   InstructionCost Cost = 0;
79   for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
80     APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
81     int64_t Val = Tmp.getSExtValue();
82     Cost += getIntImmCost(Val);
83   }
84   // We need at least one instruction to materialze the constant.
85   return std::max<InstructionCost>(1, Cost);
86 }
87 
/// Cost of an immediate when it appears as operand \p Idx of an instruction
/// with opcode \p Opcode. Returns TCC_Free for positions where AArch64 can
/// fold the immediate into the instruction cheaply.
InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Index of the operand (if any) that may fold the immediate into the
  // instruction; ~0U means no operand position folds.
  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    // The stored value (operand 0) may be an immediate.
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    // Binary ops and compares can fold an immediate second operand.
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    // No folding position: fall through to the generic materialization cost.
    break;
  }

  if (Idx == ImmIdx) {
    // The immediate is in a foldable position: treat it as free when its
    // materialization cost stays within one basic op per 64-bit chunk.
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
155 
/// Cost of an immediate when used as argument \p Idx of intrinsic \p IID.
InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    // The second operand may be encodable: free when its materialization
    // cost stays within one basic op per 64-bit chunk.
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    // Immediates that fit in 64 bits (past the fixed leading operands) are
    // recorded, not materialized, so they are free.
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
207 
208 TargetTransformInfo::PopcntSupportKind
209 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
210   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
211   if (TyWidth == 32 || TyWidth == 64)
212     return TTI::PSK_FastHardware;
213   // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
214   return TTI::PSK_Software;
215 }
216 
217 InstructionCost
218 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
219                                       TTI::TargetCostKind CostKind) {
220   auto *RetTy = ICA.getReturnType();
221   switch (ICA.getID()) {
222   case Intrinsic::umin:
223   case Intrinsic::umax: {
224     auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
225     // umin(x,y) -> sub(x,usubsat(x,y))
226     // umax(x,y) -> add(x,usubsat(y,x))
227     if (LT.second == MVT::v2i64)
228       return LT.first * 2;
229     LLVM_FALLTHROUGH;
230   }
231   case Intrinsic::smin:
232   case Intrinsic::smax: {
233     static const auto ValidMinMaxTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
234                                         MVT::v8i16, MVT::v2i32, MVT::v4i32};
235     auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
236     if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
237       return LT.first;
238     break;
239   }
240   case Intrinsic::sadd_sat:
241   case Intrinsic::ssub_sat:
242   case Intrinsic::uadd_sat:
243   case Intrinsic::usub_sat: {
244     static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
245                                      MVT::v8i16, MVT::v2i32, MVT::v4i32,
246                                      MVT::v2i64};
247     auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
248     // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
249     // need to extend the type, as it uses shr(qadd(shl, shl)).
250     unsigned Instrs =
251         LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
252     if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
253       return LT.first * Instrs;
254     break;
255   }
256   case Intrinsic::abs: {
257     static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
258                                      MVT::v8i16, MVT::v2i32, MVT::v4i32,
259                                      MVT::v2i64};
260     auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
261     if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
262       return LT.first;
263     break;
264   }
265   case Intrinsic::experimental_stepvector: {
266     InstructionCost Cost = 1; // Cost of the `index' instruction
267     auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
268     // Legalisation of illegal vectors involves an `index' instruction plus
269     // (LT.first - 1) vector adds.
270     if (LT.first > 1) {
271       Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
272       InstructionCost AddCost =
273           getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
274       Cost += AddCost * (LT.first - 1);
275     }
276     return Cost;
277   }
278   case Intrinsic::bitreverse: {
279     static const CostTblEntry BitreverseTbl[] = {
280         {Intrinsic::bitreverse, MVT::i32, 1},
281         {Intrinsic::bitreverse, MVT::i64, 1},
282         {Intrinsic::bitreverse, MVT::v8i8, 1},
283         {Intrinsic::bitreverse, MVT::v16i8, 1},
284         {Intrinsic::bitreverse, MVT::v4i16, 2},
285         {Intrinsic::bitreverse, MVT::v8i16, 2},
286         {Intrinsic::bitreverse, MVT::v2i32, 2},
287         {Intrinsic::bitreverse, MVT::v4i32, 2},
288         {Intrinsic::bitreverse, MVT::v1i64, 2},
289         {Intrinsic::bitreverse, MVT::v2i64, 2},
290     };
291     const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy);
292     const auto *Entry =
293         CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
294     // Cost Model is using the legal type(i32) that i8 and i16 will be converted
295     // to +1 so that we match the actual lowering cost
296     if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
297         TLI->getValueType(DL, RetTy, true) == MVT::i16)
298       return LegalisationCost.first * Entry->Cost + 1;
299     if (Entry)
300       return LegalisationCost.first * Entry->Cost;
301     break;
302   }
303   case Intrinsic::ctpop: {
304     static const CostTblEntry CtpopCostTbl[] = {
305         {ISD::CTPOP, MVT::v2i64, 4},
306         {ISD::CTPOP, MVT::v4i32, 3},
307         {ISD::CTPOP, MVT::v8i16, 2},
308         {ISD::CTPOP, MVT::v16i8, 1},
309         {ISD::CTPOP, MVT::i64,   4},
310         {ISD::CTPOP, MVT::v2i32, 3},
311         {ISD::CTPOP, MVT::v4i16, 2},
312         {ISD::CTPOP, MVT::v8i8,  1},
313         {ISD::CTPOP, MVT::i32,   5},
314     };
315     auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
316     MVT MTy = LT.second;
317     if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
318       // Extra cost of +1 when illegal vector types are legalized by promoting
319       // the integer type.
320       int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
321                                             RetTy->getScalarSizeInBits()
322                           ? 1
323                           : 0;
324       return LT.first * Entry->Cost + ExtraCost;
325     }
326     break;
327   }
328   default:
329     break;
330   }
331   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
332 }
333 
334 /// The function will remove redundant reinterprets casting in the presence
335 /// of the control flow
336 static Optional<Instruction *> processPhiNode(InstCombiner &IC,
337                                               IntrinsicInst &II) {
338   SmallVector<Instruction *, 32> Worklist;
339   auto RequiredType = II.getType();
340 
341   auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
342   assert(PN && "Expected Phi Node!");
343 
344   // Don't create a new Phi unless we can remove the old one.
345   if (!PN->hasOneUse())
346     return None;
347 
348   for (Value *IncValPhi : PN->incoming_values()) {
349     auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
350     if (!Reinterpret ||
351         Reinterpret->getIntrinsicID() !=
352             Intrinsic::aarch64_sve_convert_to_svbool ||
353         RequiredType != Reinterpret->getArgOperand(0)->getType())
354       return None;
355   }
356 
357   // Create the new Phi
358   LLVMContext &Ctx = PN->getContext();
359   IRBuilder<> Builder(Ctx);
360   Builder.SetInsertPoint(PN);
361   PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
362   Worklist.push_back(PN);
363 
364   for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
365     auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
366     NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
367     Worklist.push_back(Reinterpret);
368   }
369 
370   // Cleanup Phi Node and reinterprets
371   return IC.replaceInstUsesWith(II, NPN);
372 }
373 
374 static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
375                                                             IntrinsicInst &II) {
376   // If the reinterpret instruction operand is a PHI Node
377   if (isa<PHINode>(II.getArgOperand(0)))
378     return processPhiNode(IC, II);
379 
380   SmallVector<Instruction *, 32> CandidatesForRemoval;
381   Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
382 
383   const auto *IVTy = cast<VectorType>(II.getType());
384 
385   // Walk the chain of conversions.
386   while (Cursor) {
387     // If the type of the cursor has fewer lanes than the final result, zeroing
388     // must take place, which breaks the equivalence chain.
389     const auto *CursorVTy = cast<VectorType>(Cursor->getType());
390     if (CursorVTy->getElementCount().getKnownMinValue() <
391         IVTy->getElementCount().getKnownMinValue())
392       break;
393 
394     // If the cursor has the same type as I, it is a viable replacement.
395     if (Cursor->getType() == IVTy)
396       EarliestReplacement = Cursor;
397 
398     auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
399 
400     // If this is not an SVE conversion intrinsic, this is the end of the chain.
401     if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
402                                   Intrinsic::aarch64_sve_convert_to_svbool ||
403                               IntrinsicCursor->getIntrinsicID() ==
404                                   Intrinsic::aarch64_sve_convert_from_svbool))
405       break;
406 
407     CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
408     Cursor = IntrinsicCursor->getOperand(0);
409   }
410 
411   // If no viable replacement in the conversion chain was found, there is
412   // nothing to do.
413   if (!EarliestReplacement)
414     return None;
415 
416   return IC.replaceInstUsesWith(II, EarliestReplacement);
417 }
418 
419 static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
420                                                  IntrinsicInst &II) {
421   IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
422   if (!Pg)
423     return None;
424 
425   if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
426     return None;
427 
428   const auto PTruePattern =
429       cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
430   if (PTruePattern != AArch64SVEPredPattern::vl1)
431     return None;
432 
433   // The intrinsic is inserting into lane zero so use an insert instead.
434   auto *IdxTy = Type::getInt64Ty(II.getContext());
435   auto *Insert = InsertElementInst::Create(
436       II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
437   Insert->insertBefore(&II);
438   Insert->takeName(&II);
439 
440   return IC.replaceInstUsesWith(II, Insert);
441 }
442 
/// Fold cmpne(ptrue(all), dupq_lane(constant vector), dup_x(0)) — i.e. an
/// all-lane compare of a replicated constant against zero — into a ptrue of
/// the widest predicate type the constant's bit pattern permits, or into an
/// all-false predicate when the constant is all zero.
static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  LLVMContext &Ctx = II.getContext();
  IRBuilder<> Builder(Ctx);
  Builder.SetInsertPoint(&II);

  // Check that the predicate is all active
  auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return None;

  const auto PTruePattern =
      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  if (PTruePattern != AArch64SVEPredPattern::all)
    return None;

  // Check that we have a compare of zero..
  auto *DupX = dyn_cast<IntrinsicInst>(II.getArgOperand(2));
  if (!DupX || DupX->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
    return None;

  auto *DupXArg = dyn_cast<ConstantInt>(DupX->getArgOperand(0));
  if (!DupXArg || !DupXArg->isZero())
    return None;

  // ..against a dupq
  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  if (!DupQLane ||
      DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
    return None;

  // Where the dupq is a lane 0 replicate of a vector insert
  if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
    return None;

  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
  if (!VecIns ||
      VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert)
    return None;

  // Where the vector insert is a fixed constant vector insert into undef at
  // index zero
  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
    return None;

  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
    return None;

  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
  if (!ConstVec)
    return None;

  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
    return None;

  unsigned NumElts = VecTy->getNumElements();
  unsigned PredicateBits = 0;

  // Expand intrinsic operands to a 16-bit byte level predicate
  for (unsigned I = 0; I < NumElts; ++I) {
    auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
    if (!Arg)
      return None;
    // Each nonzero element sets the bit for its first byte; elements are
    // (16 / NumElts) bytes wide within the 128-bit block.
    if (!Arg->isZero())
      PredicateBits |= 1 << (I * (16 / NumElts));
  }

  // If all bits are zero bail early with an empty predicate
  if (PredicateBits == 0) {
    auto *PFalse = Constant::getNullValue(II.getType());
    PFalse->takeName(&II);
    return IC.replaceInstUsesWith(II, PFalse);
  }

  // Calculate largest predicate type used (where byte predicate is largest)
  // by OR-ing together the byte offsets of all set bits; the lowest set bit
  // of Mask then gives the element size in bytes.
  unsigned Mask = 8;
  for (unsigned I = 0; I < 16; ++I)
    if ((PredicateBits & (1 << I)) != 0)
      Mask |= (I % 8);

  unsigned PredSize = Mask & -Mask;
  auto *PredType = ScalableVectorType::get(
      Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));

  // Ensure all relevant bits are set: the pattern must be a uniform ptrue of
  // PredSize-byte elements, with no gaps.
  for (unsigned I = 0; I < 16; I += PredSize)
    if ((PredicateBits & (1 << I)) == 0)
      return None;

  // Build ptrue(all) of the deduced predicate type and reinterpret it to the
  // original result type via svbool.
  auto *PTruePat =
      ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
  auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
                                        {PredType}, {PTruePat});
  auto *ConvertToSVBool = Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
  auto *ConvertFromSVBool =
      Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
                              {II.getType()}, {ConvertToSVBool});

  ConvertFromSVBool->takeName(&II);
  return IC.replaceInstUsesWith(II, ConvertFromSVBool);
}
547 
/// Fold sve.lasta/lastb with a known predicate into a plain extractelement:
/// lasta with an all-false predicate extracts lane 0, and a ptrue(vlN)
/// predicate pins the last active lane to a constant index.
static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
                                                  IntrinsicInst &II) {
  Value *Pg = II.getArgOperand(0);
  Value *Vec = II.getArgOperand(1);
  // lasta extracts the element AFTER the last active lane; lastb extracts the
  // last active lane itself.
  bool IsAfter = II.getIntrinsicID() == Intrinsic::aarch64_sve_lasta;

  auto *C = dyn_cast<Constant>(Pg);
  if (IsAfter && C && C->isNullValue()) {
    // The intrinsic is extracting lane 0 so use an extract instead.
    auto *IdxTy = Type::getInt64Ty(II.getContext());
    auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
    Extract->insertBefore(&II);
    Extract->takeName(&II);
    return IC.replaceInstUsesWith(II, Extract);
  }

  auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
  if (!IntrPG)
    return None;

  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return None;

  const auto PTruePattern =
      cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();

  // Can the intrinsic's predicate be converted to a known constant index?
  // vlN activates exactly the first N lanes, so the last active lane is N-1.
  unsigned Idx;
  switch (PTruePattern) {
  default:
    return None;
  case AArch64SVEPredPattern::vl1:
    Idx = 0;
    break;
  case AArch64SVEPredPattern::vl2:
    Idx = 1;
    break;
  case AArch64SVEPredPattern::vl3:
    Idx = 2;
    break;
  case AArch64SVEPredPattern::vl4:
    Idx = 3;
    break;
  case AArch64SVEPredPattern::vl5:
    Idx = 4;
    break;
  case AArch64SVEPredPattern::vl6:
    Idx = 5;
    break;
  case AArch64SVEPredPattern::vl7:
    Idx = 6;
    break;
  case AArch64SVEPredPattern::vl8:
    Idx = 7;
    break;
  case AArch64SVEPredPattern::vl16:
    Idx = 15;
    break;
  }

  // Increment the index if extracting the element after the last active
  // predicate element.
  if (IsAfter)
    ++Idx;

  // Ignore extracts whose index is larger than the known minimum vector
  // length. NOTE: This is an artificial constraint where we prefer to
  // maintain what the user asked for until an alternative is proven faster.
  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
  if (Idx >= PgVTy->getMinNumElements())
    return None;

  // The intrinsic is extracting a fixed lane so use an extract instead.
  auto *IdxTy = Type::getInt64Ty(II.getContext());
  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
  Extract->insertBefore(&II);
  Extract->takeName(&II);
  return IC.replaceInstUsesWith(II, Extract);
}
627 
628 static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
629                                                 IntrinsicInst &II) {
630   LLVMContext &Ctx = II.getContext();
631   IRBuilder<> Builder(Ctx);
632   Builder.SetInsertPoint(&II);
633   // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
634   // can work with RDFFR_PP for ptest elimination.
635   auto *AllPat =
636       ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
637   auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
638                                         {II.getType()}, {AllPat});
639   auto *RDFFR =
640       Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
641   RDFFR->takeName(&II);
642   return IC.replaceInstUsesWith(II, RDFFR);
643 }
644 
645 Optional<Instruction *>
646 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
647                                      IntrinsicInst &II) const {
648   Intrinsic::ID IID = II.getIntrinsicID();
649   switch (IID) {
650   default:
651     break;
652   case Intrinsic::aarch64_sve_convert_from_svbool:
653     return instCombineConvertFromSVBool(IC, II);
654   case Intrinsic::aarch64_sve_dup:
655     return instCombineSVEDup(IC, II);
656   case Intrinsic::aarch64_sve_cmpne:
657   case Intrinsic::aarch64_sve_cmpne_wide:
658     return instCombineSVECmpNE(IC, II);
659   case Intrinsic::aarch64_sve_rdffr:
660     return instCombineRDFFR(IC, II);
661   case Intrinsic::aarch64_sve_lasta:
662   case Intrinsic::aarch64_sve_lastb:
663     return instCombineSVELast(IC, II);
664   }
665 
666   return None;
667 }
668 
669 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
670                                            ArrayRef<const Value *> Args) {
671 
672   // A helper that returns a vector type from the given type. The number of
673   // elements in type Ty determine the vector width.
674   auto toVectorTy = [&](Type *ArgTy) {
675     return VectorType::get(ArgTy->getScalarType(),
676                            cast<VectorType>(DstTy)->getElementCount());
677   };
678 
679   // Exit early if DstTy is not a vector type whose elements are at least
680   // 16-bits wide.
681   if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
682     return false;
683 
684   // Determine if the operation has a widening variant. We consider both the
685   // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
686   // instructions.
687   //
688   // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
689   //       verify that their extending operands are eliminated during code
690   //       generation.
691   switch (Opcode) {
692   case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
693   case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
694     break;
695   default:
696     return false;
697   }
698 
699   // To be a widening instruction (either the "wide" or "long" versions), the
700   // second operand must be a sign- or zero extend having a single user. We
701   // only consider extends having a single user because they may otherwise not
702   // be eliminated.
703   if (Args.size() != 2 ||
704       (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
705       !Args[1]->hasOneUse())
706     return false;
707   auto *Extend = cast<CastInst>(Args[1]);
708 
709   // Legalize the destination type and ensure it can be used in a widening
710   // operation.
711   auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
712   unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
713   if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
714     return false;
715 
716   // Legalize the source type and ensure it can be used in a widening
717   // operation.
718   auto *SrcTy = toVectorTy(Extend->getSrcTy());
719   auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
720   unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
721   if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
722     return false;
723 
724   // Get the total number of vector elements in the legalized types.
725   InstructionCost NumDstEls =
726       DstTyL.first * DstTyL.second.getVectorMinNumElements();
727   InstructionCost NumSrcEls =
728       SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
729 
730   // Return true if the legalized types have the same number of vector elements
731   // and the destination element type size is twice that of the source type.
732   return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
733 }
734 
735 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
736                                                  Type *Src,
737                                                  TTI::CastContextHint CCH,
738                                                  TTI::TargetCostKind CostKind,
739                                                  const Instruction *I) {
740   int ISD = TLI->InstructionOpcodeToISD(Opcode);
741   assert(ISD && "Invalid opcode");
742 
743   // If the cast is observable, and it is used by a widening instruction (e.g.,
744   // uaddl, saddw, etc.), it may be free.
745   if (I && I->hasOneUse()) {
746     auto *SingleUser = cast<Instruction>(*I->user_begin());
747     SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
748     if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
749       // If the cast is the second operand, it is free. We will generate either
750       // a "wide" or "long" version of the widening instruction.
751       if (I == SingleUser->getOperand(1))
752         return 0;
753       // If the cast is not the second operand, it will be free if it looks the
754       // same as the second operand. In this case, we will generate a "long"
755       // version of the widening instruction.
756       if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
757         if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
758             cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
759           return 0;
760     }
761   }
762 
763   // TODO: Allow non-throughput costs that aren't binary.
764   auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
765     if (CostKind != TTI::TCK_RecipThroughput)
766       return Cost == 0 ? 0 : 1;
767     return Cost;
768   };
769 
770   EVT SrcTy = TLI->getValueType(DL, Src);
771   EVT DstTy = TLI->getValueType(DL, Dst);
772 
773   if (!SrcTy.isSimple() || !DstTy.isSimple())
774     return AdjustCost(
775         BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
776 
777   static const TypeConversionCostTblEntry
778   ConversionTbl[] = {
779     { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32,  1 },
780     { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64,  0 },
781     { ISD::TRUNCATE, MVT::v8i8,  MVT::v8i32,  3 },
782     { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
783 
784     // Truncations on nxvmiN
785     { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
786     { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
787     { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
788     { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
789     { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
790     { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
791     { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
792     { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
793     { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
794     { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
795     { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
796     { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
797     { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
798     { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
799     { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
800 
801     // The number of shll instructions for the extension.
802     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
803     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
804     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
805     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
806     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
807     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
808     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
809     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
810     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
811     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
812     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
813     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
814     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
815     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
816     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
817     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
818 
819     // LowerVectorINT_TO_FP:
820     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
821     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
822     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
823     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
824     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
825     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
826 
827     // Complex: to v2f32
828     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
829     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
830     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
831     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
832     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
833     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
834 
835     // Complex: to v4f32
836     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8,  4 },
837     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
838     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
839     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
840 
841     // Complex: to v8f32
842     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
843     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
844     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
845     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
846 
847     // Complex: to v16f32
848     { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
849     { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
850 
851     // Complex: to v2f64
852     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
853     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
854     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
855     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
856     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
857     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
858 
859 
860     // LowerVectorFP_TO_INT
861     { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
862     { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
863     { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
864     { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
865     { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
866     { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
867 
868     // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
869     { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
870     { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
871     { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f32, 1 },
872     { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
873     { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
874     { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f32, 1 },
875 
876     // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
877     { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
878     { ISD::FP_TO_SINT, MVT::v4i8,  MVT::v4f32, 2 },
879     { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
880     { ISD::FP_TO_UINT, MVT::v4i8,  MVT::v4f32, 2 },
881 
882     // Complex, from nxv2f32.
883     { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
884     { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
885     { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
886     { ISD::FP_TO_SINT, MVT::nxv2i8,  MVT::nxv2f32, 1 },
887     { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
888     { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
889     { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
890     { ISD::FP_TO_UINT, MVT::nxv2i8,  MVT::nxv2f32, 1 },
891 
892     // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
893     { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
894     { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
895     { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f64, 2 },
896     { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
897     { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
898     { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },
899 
900     // Complex, from nxv2f64.
901     { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
902     { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
903     { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
904     { ISD::FP_TO_SINT, MVT::nxv2i8,  MVT::nxv2f64, 1 },
905     { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
906     { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
907     { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
908     { ISD::FP_TO_UINT, MVT::nxv2i8,  MVT::nxv2f64, 1 },
909 
910     // Complex, from nxv4f32.
911     { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
912     { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
913     { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
914     { ISD::FP_TO_SINT, MVT::nxv4i8,  MVT::nxv4f32, 1 },
915     { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
916     { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
917     { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
918     { ISD::FP_TO_UINT, MVT::nxv4i8,  MVT::nxv4f32, 1 },
919 
920     // Complex, from nxv8f64. Illegal -> illegal conversions not required.
921     { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
922     { ISD::FP_TO_SINT, MVT::nxv8i8,  MVT::nxv8f64, 7 },
923     { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
924     { ISD::FP_TO_UINT, MVT::nxv8i8,  MVT::nxv8f64, 7 },
925 
926     // Complex, from nxv4f64. Illegal -> illegal conversions not required.
927     { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
928     { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
929     { ISD::FP_TO_SINT, MVT::nxv4i8,  MVT::nxv4f64, 3 },
930     { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
931     { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
932     { ISD::FP_TO_UINT, MVT::nxv4i8,  MVT::nxv4f64, 3 },
933 
934     // Complex, from nxv8f32. Illegal -> illegal conversions not required.
935     { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
936     { ISD::FP_TO_SINT, MVT::nxv8i8,  MVT::nxv8f32, 3 },
937     { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
938     { ISD::FP_TO_UINT, MVT::nxv8i8,  MVT::nxv8f32, 3 },
939 
940     // Complex, from nxv8f16.
941     { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
942     { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
943     { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
944     { ISD::FP_TO_SINT, MVT::nxv8i8,  MVT::nxv8f16, 1 },
945     { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
946     { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
947     { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
948     { ISD::FP_TO_UINT, MVT::nxv8i8,  MVT::nxv8f16, 1 },
949 
950     // Complex, from nxv4f16.
951     { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
952     { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
953     { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
954     { ISD::FP_TO_SINT, MVT::nxv4i8,  MVT::nxv4f16, 1 },
955     { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
956     { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
957     { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
958     { ISD::FP_TO_UINT, MVT::nxv4i8,  MVT::nxv4f16, 1 },
959 
960     // Complex, from nxv2f16.
961     { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
962     { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
963     { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
964     { ISD::FP_TO_SINT, MVT::nxv2i8,  MVT::nxv2f16, 1 },
965     { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
966     { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
967     { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
968     { ISD::FP_TO_UINT, MVT::nxv2i8,  MVT::nxv2f16, 1 },
969 
970     // Truncate from nxvmf32 to nxvmf16.
971     { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
972     { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
973     { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
974 
975     // Truncate from nxvmf64 to nxvmf16.
976     { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
977     { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
978     { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
979 
980     // Truncate from nxvmf64 to nxvmf32.
981     { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
982     { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
983     { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
984 
985     // Extend from nxvmf16 to nxvmf32.
986     { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
987     { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
988     { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
989 
990     // Extend from nxvmf16 to nxvmf64.
991     { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
992     { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
993     { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
994 
995     // Extend from nxvmf32 to nxvmf64.
996     { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
997     { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
998     { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
999 
1000   };
1001 
1002   if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
1003                                                  DstTy.getSimpleVT(),
1004                                                  SrcTy.getSimpleVT()))
1005     return AdjustCost(Entry->Cost);
1006 
1007   return AdjustCost(
1008       BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1009 }
1010 
// Compute the cost of extracting a vector element at \p Index and then
// sign/zero-extending it to scalar type \p Dst. On AArch64 the extension is
// often folded into the extract itself (smov/umov), making it free.
InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
                                                         Type *Dst,
                                                         VectorType *VecTy,
                                                         unsigned Index) {

  // Make sure we were given a valid extend opcode.
  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
         "Invalid opcode");

  // We are extending an element we extract from a vector, so the source type
  // of the extend is the element type of the vector.
  auto *Src = VecTy->getElementType();

  // Sign- and zero-extends are for integer types only.
  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");

  // Get the cost for the extract. We compute the cost (if any) for the extend
  // below.
  InstructionCost Cost =
      getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);

  // Legalize the types.
  auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
  auto DstVT = TLI->getValueType(DL, Dst);
  auto SrcVT = TLI->getValueType(DL, Src);
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // If the resulting type is still a vector and the destination type is legal,
  // we may get the extension for free. If not, get the default cost for the
  // extend.
  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
    return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
                                   CostKind);

  // The destination type should be larger than the element type. If not, get
  // the default cost for the extend.
  if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
    return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
                                   CostKind);

  switch (Opcode) {
  default:
    llvm_unreachable("Opcode should be either SExt or ZExt");

  // For sign-extends, we only need a smov, which performs the extension
  // automatically.
  case Instruction::SExt:
    return Cost;

  // For zero-extends, the extend is performed automatically by a umov unless
  // the destination type is i64 and the element type is i8 or i16.
  case Instruction::ZExt:
    if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
      return Cost;
    // Falls through to the default-cost return below for the i64 <- i8/i16
    // zero-extend case, which needs a separate extend instruction.
  }

  // If we are unable to perform the extend for free, get the default cost.
  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
                                 CostKind);
}
1071 
1072 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
1073                                                TTI::TargetCostKind CostKind,
1074                                                const Instruction *I) {
1075   if (CostKind != TTI::TCK_RecipThroughput)
1076     return Opcode == Instruction::PHI ? 0 : 1;
1077   assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
1078   // Branches are assumed to be predicted.
1079   return 0;
1080 }
1081 
1082 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1083                                                    unsigned Index) {
1084   assert(Val->isVectorTy() && "This must be a vector type");
1085 
1086   if (Index != -1U) {
1087     // Legalize the type.
1088     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1089 
1090     // This type is legalized to a scalar type.
1091     if (!LT.second.isVector())
1092       return 0;
1093 
1094     // The type may be split. Normalize the index to the new type.
1095     unsigned Width = LT.second.getVectorNumElements();
1096     Index = Index % Width;
1097 
1098     // The element at index zero is already inside the vector.
1099     if (Index == 0)
1100       return 0;
1101   }
1102 
1103   // All other insert/extracts cost this much.
1104   return ST->getVectorInsertExtractBaseCost();
1105 }
1106 
// Compute the cost of an arithmetic instruction of \p Opcode on type \p Ty.
// Handles AArch64-specific cases (widening ops, division by constants,
// v2i64 multiplies); everything else defers to the base implementation.
InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
  // add in the widening overhead specified by the sub-target. Since the
  // extends feeding widening instructions are performed automatically, they
  // aren't present in the generated code and have a zero cost. By adding a
  // widening overhead here, we attach the total cost of the combined operation
  // to the widening instruction.
  InstructionCost Cost = 0;
  if (isWideningInstruction(Ty, Opcode, Args))
    Cost += ST->getWideningBaseCost();

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                                Opd2Info,
                                                Opd1PropInfo, Opd2PropInfo);
  case ISD::SDIV:
    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
        Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
      // On AArch64, scalar signed division by constants power-of-two are
      // normally expanded to the sequence ADD + CMP + SELECT + SRA.
      // The OperandValue properties many not be same as that of previous
      // operation; conservatively assume OP_None.
      Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                     Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
                                     Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                     Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      return Cost;
    }
    // SDIV by a non-power-of-two constant uses the same expansion as UDIV.
    LLVM_FALLTHROUGH;
  case ISD::UDIV:
    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
      auto VT = TLI->getValueType(DL, Ty);
      if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
        // Vector signed division by constant are expanded to the
        // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
        // to MULHS + SUB + SRL + ADD + SRL.
        InstructionCost MulCost = getArithmeticInstrCost(
            Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
            TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
        InstructionCost AddCost = getArithmeticInstrCost(
            Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
            TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
        InstructionCost ShrCost = getArithmeticInstrCost(
            Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
            TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
        // Approximate the five-instruction expansion above with a doubled
        // mul/add/shift estimate plus one.
        return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
      }
    }

    // Non-constant divisor: start from the base cost of the division itself.
    Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                          Opd2Info,
                                          Opd1PropInfo, Opd2PropInfo);
    if (Ty->isVectorTy()) {
      // On AArch64, vector divisions are not supported natively and are
      // expanded into scalar divisions of each pair of elements.
      Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
                                     Opd1Info, Opd2Info, Opd1PropInfo,
                                     Opd2PropInfo);
      Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
                                     Opd1Info, Opd2Info, Opd1PropInfo,
                                     Opd2PropInfo);
      // TODO: if one of the arguments is scalar, then it's not necessary to
      // double the cost of handling the vector elements.
      Cost += Cost;
    }
    return Cost;

  case ISD::MUL:
    if (LT.second != MVT::v2i64)
      return (Cost + 1) * LT.first;
    // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
    // as elements are extracted from the vectors and the muls scalarized.
    // As getScalarizationOverhead is a bit too pessimistic, we estimate the
    // cost for a i64 vector directly here, which is:
    // - four i64 extracts,
    // - two i64 inserts, and
    // - two muls.
    // So, for a v2i64 with LT.First = 1 the cost is 8, and for a v4i64 with
    // LT.first = 2 the cost is 16.
    return LT.first * 8;
  case ISD::ADD:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // These nodes are marked as 'custom' for combining purposes only.
    // We know that they are legal. See LowerAdd in ISelLowering.
    return (Cost + 1) * LT.first;

  case ISD::FADD:
    // These nodes are marked as 'custom' just to lower them to SVE.
    // We know said lowering will incur no additional cost.
    if (isa<FixedVectorType>(Ty) && !Ty->getScalarType()->isFP128Ty())
      return (Cost + 2) * LT.first;

    // fp128 (or scalable) FADD: no special handling; use the base cost.
    return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                                Opd2Info,
                                                Opd1PropInfo, Opd2PropInfo);
  }
}
1235 
1236 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
1237                                                           ScalarEvolution *SE,
1238                                                           const SCEV *Ptr) {
1239   // Address computations in vectorized code with non-consecutive addresses will
1240   // likely result in more instructions compared to scalar code where the
1241   // computation can more often be merged into the index mode. The resulting
1242   // extra micro-ops can significantly decrease throughput.
1243   unsigned NumVectorInstToHideOverhead = 10;
1244   int MaxMergeDistance = 64;
1245 
1246   if (Ty->isVectorTy() && SE &&
1247       !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1248     return NumVectorInstToHideOverhead;
1249 
1250   // In many cases the address computation is not merged into the instruction
1251   // addressing mode.
1252   return 1;
1253 }
1254 
// Compute the cost of a compare or select of type \p ValTy. Special-cases
// fixed-width vector selects, which AArch64 lowers poorly when wider than a
// register; other cases defer to the base implementation.
InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                   Type *CondTy,
                                                   CmpInst::Predicate VecPred,
                                                   TTI::TargetCostKind CostKind,
                                                   const Instruction *I) {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // We don't lower some vector selects well that are wider than the register
  // width.
  if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
    // We would need this many instructions to hide the scalarization happening.
    const int AmortizationCost = 20;

    // If VecPred is not set, check if we can get a predicate from the context
    // instruction, if its type matches the requested ValTy.
    if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
      CmpInst::Predicate CurrentPred;
      if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
                            m_Value())))
        VecPred = CurrentPred;
    }
    // Check if we have a compare/select chain that can be lowered using CMxx &
    // BFI pair.
    if (CmpInst::isIntPredicate(VecPred)) {
      // Types whose legal form fits an integer min/max style lowering cost
      // just the legalization split factor.
      static const auto ValidMinMaxTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                          MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                          MVT::v2i64};
      auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
      if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
        return LT.first;
    }

    // Hand-tuned costs for vector selects that scalarize or split badly; the
    // v*i64 entries are multiplied by AmortizationCost to strongly discourage
    // vectorization unless there is plenty of other profitable work.
    static const TypeConversionCostTblEntry
    VectorSelectTbl[] = {
      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }
  }
  // The base case handles scalable vectors fine for now, since it treats the
  // cost as 1 * legalization cost.
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
1314 
1315 AArch64TTIImpl::TTI::MemCmpExpansionOptions
1316 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
1317   TTI::MemCmpExpansionOptions Options;
1318   if (ST->requiresStrictAlign()) {
1319     // TODO: Add cost modeling for strict align. Misaligned loads expand to
1320     // a bunch of instructions when strict align is enabled.
1321     return Options;
1322   }
1323   Options.AllowOverlappingLoads = true;
1324   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
1325   Options.NumLoadsPerBlock = Options.MaxNumLoads;
1326   // TODO: Though vector loads usually perform well on AArch64, in some targets
1327   // they may wake up the FP unit, which raises the power consumption.  Perhaps
1328   // they could be used with no holds barred (-O3).
1329   Options.LoadSizes = {8, 4, 2, 1};
1330   return Options;
1331 }
1332 
1333 InstructionCost
1334 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
1335                                       Align Alignment, unsigned AddressSpace,
1336                                       TTI::TargetCostKind CostKind) {
1337   if (!isa<ScalableVectorType>(Src))
1338     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1339                                         CostKind);
1340   auto LT = TLI->getTypeLegalizationCost(DL, Src);
1341   if (!LT.first.isValid())
1342     return InstructionCost::getInvalid();
1343   return LT.first * 2;
1344 }
1345 
1346 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
1347     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1348     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1349 
1350   if (!isa<ScalableVectorType>(DataTy))
1351     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1352                                          Alignment, CostKind, I);
1353   auto *VT = cast<VectorType>(DataTy);
1354   auto LT = TLI->getTypeLegalizationCost(DL, DataTy);
1355   if (!LT.first.isValid())
1356     return InstructionCost::getInvalid();
1357 
1358   ElementCount LegalVF = LT.second.getVectorElementCount();
1359   Optional<unsigned> MaxNumVScale = getMaxVScale();
1360   assert(MaxNumVScale && "Expected valid max vscale value");
1361 
1362   InstructionCost MemOpCost =
1363       getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
1364   unsigned MaxNumElementsPerGather =
1365       MaxNumVScale.getValue() * LegalVF.getKnownMinValue();
1366   return LT.first * MaxNumElementsPerGather * MemOpCost;
1367 }
1368 
1369 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
1370   return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
1371 }
1372 
// Compute the cost of a plain load/store of type \p Ty, including penalties
// for slow misaligned 128-bit stores and small NEON i8 vectors.
InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
                                                MaybeAlign Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                const Instruction *I) {
  // Type legalization can't handle structs
  if (TLI->getValueType(DL, Ty,  true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
                                  CostKind);

  auto LT = TLI->getTypeLegalizationCost(DL, Ty);
  if (!LT.first.isValid())
    return InstructionCost::getInvalid();

  // TODO: consider latency as well for TCK_SizeAndLatency.
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
    return LT.first;

  // Any remaining non-throughput kind: a single memory op.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
      LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
    // Unaligned stores are extremely inefficient. We don't split all
    // unaligned 128-bit stores because the negative impact that has shown in
    // practice on inlined block copy code.
    // We make such stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
    const int AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;
  }

  // Small NEON vectors of i8 are costly to load/store because there is no
  // v.4b register class; penalize them below the profitable element count.
  if (useNeonVector(Ty) &&
      cast<VectorType>(Ty)->getElementType()->isIntegerTy(8)) {
    unsigned ProfitableNumElements;
    if (Opcode == Instruction::Store)
      // We use a custom trunc store lowering so v.4b should be profitable.
      ProfitableNumElements = 4;
    else
      // We scalarize the loads because there is not v.4b register and we
      // have to promote the elements to v.2.
      ProfitableNumElements = 8;

    if (cast<FixedVectorType>(Ty)->getNumElements() < ProfitableNumElements) {
      unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
      unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
      // We generate 2 instructions per vector element.
      return NumVectorizableInstsToAmortize * NumVecElts * 2;
    }
  }

  // Default: one memory operation per legalized part.
  return LT.first;
}
1427 
1428 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
1429     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1430     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1431     bool UseMaskForCond, bool UseMaskForGaps) {
1432   assert(Factor >= 2 && "Invalid interleave factor");
1433   auto *VecVTy = cast<FixedVectorType>(VecTy);
1434 
1435   if (!UseMaskForCond && !UseMaskForGaps &&
1436       Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1437     unsigned NumElts = VecVTy->getNumElements();
1438     auto *SubVecTy =
1439         FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1440 
1441     // ldN/stN only support legal vector types of size 64 or 128 in bits.
1442     // Accesses having vector types that are a multiple of 128 bits can be
1443     // matched to more than one ldN/stN instruction.
1444     if (NumElts % Factor == 0 &&
1445         TLI->isLegalInterleavedAccessType(SubVecTy, DL))
1446       return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1447   }
1448 
1449   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1450                                            Alignment, AddressSpace, CostKind,
1451                                            UseMaskForCond, UseMaskForGaps);
1452 }
1453 
1454 InstructionCost
1455 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
1456   InstructionCost Cost = 0;
1457   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1458   for (auto *I : Tys) {
1459     if (!I->isVectorTy())
1460       continue;
1461     if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
1462         128)
1463       Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
1464               getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
1465   }
1466   return Cost;
1467 }
1468 
/// Return the preferred unroll-interleave factor. The vectorization factor
/// \p VF is ignored here; the value comes straight from the subtarget.
unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  return ST->getMaxInterleaveFactor();
}
1472 
1473 // For Falkor, we want to avoid having too many strided loads in a loop since
1474 // that can exhaust the HW prefetcher resources.  We adjust the unroller
1475 // MaxCount preference below to attempt to ensure unrolling doesn't create too
1476 // many strided loads.
1477 static void
1478 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1479                               TargetTransformInfo::UnrollingPreferences &UP) {
1480   enum { MaxStridedLoads = 7 };
1481   auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
1482     int StridedLoads = 0;
1483     // FIXME? We could make this more precise by looking at the CFG and
1484     // e.g. not counting loads in each side of an if-then-else diamond.
1485     for (const auto BB : L->blocks()) {
1486       for (auto &I : *BB) {
1487         LoadInst *LMemI = dyn_cast<LoadInst>(&I);
1488         if (!LMemI)
1489           continue;
1490 
1491         Value *PtrValue = LMemI->getPointerOperand();
1492         if (L->isLoopInvariant(PtrValue))
1493           continue;
1494 
1495         const SCEV *LSCEV = SE.getSCEV(PtrValue);
1496         const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
1497         if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
1498           continue;
1499 
1500         // FIXME? We could take pairing of unrolled load copies into account
1501         // by looking at the AddRec, but we would probably have to limit this
1502         // to loops with no stores or other memory optimization barriers.
1503         ++StridedLoads;
1504         // We've seen enough strided loads that seeing more won't make a
1505         // difference.
1506         if (StridedLoads > MaxStridedLoads / 2)
1507           return StridedLoads;
1508       }
1509     }
1510     return StridedLoads;
1511   };
1512 
1513   int StridedLoads = countStridedLoads(L, SE);
1514   LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
1515                     << " strided loads\n");
1516   // Pick the largest power of 2 unroll count that won't result in too many
1517   // strided loads.
1518   if (StridedLoads) {
1519     UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
1520     LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
1521                       << UP.MaxCount << '\n');
1522   }
1523 }
1524 
1525 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1526                                              TTI::UnrollingPreferences &UP) {
1527   // Enable partial unrolling and runtime unrolling.
1528   BaseT::getUnrollingPreferences(L, SE, UP);
1529 
1530   // For inner loop, it is more likely to be a hot one, and the runtime check
1531   // can be promoted out from LICM pass, so the overhead is less, let's try
1532   // a larger threshold to unroll more loops.
1533   if (L->getLoopDepth() > 1)
1534     UP.PartialThreshold *= 2;
1535 
1536   // Disable partial & runtime unrolling on -Os.
1537   UP.PartialOptSizeThreshold = 0;
1538 
1539   if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
1540       EnableFalkorHWPFUnrollFix)
1541     getFalkorUnrollingPreferences(L, SE, UP);
1542 
1543   // Scan the loop: don't unroll loops with calls as this could prevent
1544   // inlining. Don't unroll vector loops either, as they don't benefit much from
1545   // unrolling.
1546   for (auto *BB : L->getBlocks()) {
1547     for (auto &I : *BB) {
1548       // Don't unroll vectorised loop.
1549       if (I.getType()->isVectorTy())
1550         return;
1551 
1552       if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1553         if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1554           if (!isLoweredToCall(F))
1555             continue;
1556         }
1557         return;
1558       }
1559     }
1560   }
1561 
1562   // Enable runtime unrolling for in-order models
1563   // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
1564   // checking for that case, we can ensure that the default behaviour is
1565   // unchanged
1566   if (ST->getProcFamily() != AArch64Subtarget::Others &&
1567       !ST->getSchedModel().isOutOfOrder()) {
1568     UP.Runtime = true;
1569     UP.Partial = true;
1570     UP.UpperBound = true;
1571     UP.UnrollRemainder = true;
1572     UP.DefaultUnrollRuntimeCount = 4;
1573 
1574     UP.UnrollAndJam = true;
1575     UP.UnrollAndJamInnerLoopThreshold = 60;
1576   }
1577 }
1578 
/// AArch64 has no target-specific peeling heuristics; defer entirely to the
/// generic implementation.
void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
1583 
1584 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
1585                                                          Type *ExpectedType) {
1586   switch (Inst->getIntrinsicID()) {
1587   default:
1588     return nullptr;
1589   case Intrinsic::aarch64_neon_st2:
1590   case Intrinsic::aarch64_neon_st3:
1591   case Intrinsic::aarch64_neon_st4: {
1592     // Create a struct type
1593     StructType *ST = dyn_cast<StructType>(ExpectedType);
1594     if (!ST)
1595       return nullptr;
1596     unsigned NumElts = Inst->getNumArgOperands() - 1;
1597     if (ST->getNumElements() != NumElts)
1598       return nullptr;
1599     for (unsigned i = 0, e = NumElts; i != e; ++i) {
1600       if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
1601         return nullptr;
1602     }
1603     Value *Res = UndefValue::get(ExpectedType);
1604     IRBuilder<> Builder(Inst);
1605     for (unsigned i = 0, e = NumElts; i != e; ++i) {
1606       Value *L = Inst->getArgOperand(i);
1607       Res = Builder.CreateInsertValue(Res, L, i);
1608     }
1609     return Res;
1610   }
1611   case Intrinsic::aarch64_neon_ld2:
1612   case Intrinsic::aarch64_neon_ld3:
1613   case Intrinsic::aarch64_neon_ld4:
1614     if (Inst->getType() == ExpectedType)
1615       return Inst;
1616     return nullptr;
1617   }
1618 }
1619 
1620 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
1621                                         MemIntrinsicInfo &Info) {
1622   switch (Inst->getIntrinsicID()) {
1623   default:
1624     break;
1625   case Intrinsic::aarch64_neon_ld2:
1626   case Intrinsic::aarch64_neon_ld3:
1627   case Intrinsic::aarch64_neon_ld4:
1628     Info.ReadMem = true;
1629     Info.WriteMem = false;
1630     Info.PtrVal = Inst->getArgOperand(0);
1631     break;
1632   case Intrinsic::aarch64_neon_st2:
1633   case Intrinsic::aarch64_neon_st3:
1634   case Intrinsic::aarch64_neon_st4:
1635     Info.ReadMem = false;
1636     Info.WriteMem = true;
1637     Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
1638     break;
1639   }
1640 
1641   switch (Inst->getIntrinsicID()) {
1642   default:
1643     return false;
1644   case Intrinsic::aarch64_neon_ld2:
1645   case Intrinsic::aarch64_neon_st2:
1646     Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
1647     break;
1648   case Intrinsic::aarch64_neon_ld3:
1649   case Intrinsic::aarch64_neon_st3:
1650     Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
1651     break;
1652   case Intrinsic::aarch64_neon_ld4:
1653   case Intrinsic::aarch64_neon_st4:
1654     Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
1655     break;
1656   }
1657   return true;
1658 }
1659 
1660 /// See if \p I should be considered for address type promotion. We check if \p
1661 /// I is a sext with right type and used in memory accesses. If it used in a
1662 /// "complex" getelementptr, we allow it to be promoted without finding other
1663 /// sext instructions that sign extended the same initial value. A getelementptr
1664 /// is considered as "complex" if it has more than 2 operands.
1665 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
1666     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
1667   bool Considerable = false;
1668   AllowPromotionWithoutCommonHeader = false;
1669   if (!isa<SExtInst>(&I))
1670     return false;
1671   Type *ConsideredSExtType =
1672       Type::getInt64Ty(I.getParent()->getParent()->getContext());
1673   if (I.getType() != ConsideredSExtType)
1674     return false;
1675   // See if the sext is the one with the right type and used in at least one
1676   // GetElementPtrInst.
1677   for (const User *U : I.users()) {
1678     if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
1679       Considerable = true;
1680       // A getelementptr is considered as "complex" if it has more than 2
1681       // operands. We will promote a SExt used in such complex GEP as we
1682       // expect some computation to be merged if they are done on 64 bits.
1683       if (GEPInst->getNumOperands() > 2) {
1684         AllowPromotionWithoutCommonHeader = true;
1685         break;
1686       }
1687     }
1688   }
1689   return Considerable;
1690 }
1691 
1692 bool AArch64TTIImpl::isLegalToVectorizeReduction(
1693     const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
1694   if (!VF.isScalable())
1695     return true;
1696 
1697   Type *Ty = RdxDesc.getRecurrenceType();
1698   if (Ty->isBFloatTy() || !isLegalElementTypeForSVE(Ty))
1699     return false;
1700 
1701   switch (RdxDesc.getRecurrenceKind()) {
1702   case RecurKind::Add:
1703   case RecurKind::FAdd:
1704   case RecurKind::And:
1705   case RecurKind::Or:
1706   case RecurKind::Xor:
1707   case RecurKind::SMin:
1708   case RecurKind::SMax:
1709   case RecurKind::UMin:
1710   case RecurKind::UMax:
1711   case RecurKind::FMin:
1712   case RecurKind::FMax:
1713     return true;
1714   default:
1715     return false;
1716   }
1717 }
1718 
1719 InstructionCost
1720 AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
1721                                        bool IsPairwise, bool IsUnsigned,
1722                                        TTI::TargetCostKind CostKind) {
1723   if (!isa<ScalableVectorType>(Ty))
1724     return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
1725                                          CostKind);
1726   assert((isa<ScalableVectorType>(Ty) && isa<ScalableVectorType>(CondTy)) &&
1727          "Both vector needs to be scalable");
1728 
1729   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1730   InstructionCost LegalizationCost = 0;
1731   if (LT.first > 1) {
1732     Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
1733     unsigned CmpOpcode =
1734         Ty->isFPOrFPVectorTy() ? Instruction::FCmp : Instruction::ICmp;
1735     LegalizationCost =
1736         getCmpSelInstrCost(CmpOpcode, LegalVTy, LegalVTy,
1737                            CmpInst::BAD_ICMP_PREDICATE, CostKind) +
1738         getCmpSelInstrCost(Instruction::Select, LegalVTy, LegalVTy,
1739                            CmpInst::BAD_ICMP_PREDICATE, CostKind);
1740     LegalizationCost *= LT.first - 1;
1741   }
1742 
1743   return LegalizationCost + /*Cost of horizontal reduction*/ 2;
1744 }
1745 
1746 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
1747     unsigned Opcode, VectorType *ValTy, bool IsPairwise,
1748     TTI::TargetCostKind CostKind) {
1749   assert(!IsPairwise && "Cannot be pair wise to continue");
1750 
1751   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1752   InstructionCost LegalizationCost = 0;
1753   if (LT.first > 1) {
1754     Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
1755     LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
1756     LegalizationCost *= LT.first - 1;
1757   }
1758 
1759   int ISD = TLI->InstructionOpcodeToISD(Opcode);
1760   assert(ISD && "Invalid opcode");
1761   // Add the final reduction cost for the legal horizontal reduction
1762   switch (ISD) {
1763   case ISD::ADD:
1764   case ISD::AND:
1765   case ISD::OR:
1766   case ISD::XOR:
1767   case ISD::FADD:
1768     return LegalizationCost + 2;
1769   default:
1770     return InstructionCost::getInvalid();
1771   }
1772 }
1773 
/// Cost model for fixed-width arithmetic vector reductions.
/// Scalable types are delegated to the SVE-specific model and pairwise forms
/// to the generic implementation; everything else is looked up in a cost
/// table keyed on the ISD opcode and the legalized type.
InstructionCost
AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                           bool IsPairwiseForm,
                                           TTI::TargetCostKind CostKind) {

  if (isa<ScalableVectorType>(ValTy))
    return getArithmeticReductionCostSVE(Opcode, ValTy, IsPairwiseForm,
                                         CostKind);
  if (IsPairwiseForm)
    return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
                                             CostKind);

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
  MVT MTy = LT.second;
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Horizontal adds can use the 'addv' instruction. We model the cost of these
  // instructions as normal vector adds. This is the only arithmetic vector
  // reduction operation for which we have an instruction.
  // OR, XOR and AND costs should match the codegen from:
  // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
  // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
  // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
  static const CostTblEntry CostTblNoPairwise[]{
      {ISD::ADD, MVT::v8i8,   1},
      {ISD::ADD, MVT::v16i8,  1},
      {ISD::ADD, MVT::v4i16,  1},
      {ISD::ADD, MVT::v8i16,  1},
      {ISD::ADD, MVT::v4i32,  1},
      {ISD::OR,  MVT::v8i8,  15},
      {ISD::OR,  MVT::v16i8, 17},
      {ISD::OR,  MVT::v4i16,  7},
      {ISD::OR,  MVT::v8i16,  9},
      {ISD::OR,  MVT::v2i32,  3},
      {ISD::OR,  MVT::v4i32,  5},
      {ISD::OR,  MVT::v2i64,  3},
      {ISD::XOR, MVT::v8i8,  15},
      {ISD::XOR, MVT::v16i8, 17},
      {ISD::XOR, MVT::v4i16,  7},
      {ISD::XOR, MVT::v8i16,  9},
      {ISD::XOR, MVT::v2i32,  3},
      {ISD::XOR, MVT::v4i32,  5},
      {ISD::XOR, MVT::v2i64,  3},
      {ISD::AND, MVT::v8i8,  15},
      {ISD::AND, MVT::v16i8, 17},
      {ISD::AND, MVT::v4i16,  7},
      {ISD::AND, MVT::v8i16,  9},
      {ISD::AND, MVT::v2i32,  3},
      {ISD::AND, MVT::v4i32,  5},
      {ISD::AND, MVT::v2i64,  3},
  };
  switch (ISD) {
  default:
    break;
  case ISD::ADD:
    // ADD entries model a full reduction of one legal part, so scale by the
    // number of legal-sized parts (LT.first).
    if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
      return LT.first * Entry->Cost;
    break;
  case ISD::XOR:
  case ISD::AND:
  case ISD::OR:
    const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
    if (!Entry)
      break;
    auto *ValVTy = cast<FixedVectorType>(ValTy);
    // The table entry applies only when the element type isn't i1, the
    // legalized vector has no more lanes than the original, and the original
    // lane count is a power of two.
    if (!ValVTy->getElementType()->isIntegerTy(1) &&
        MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
        isPowerOf2_32(ValVTy->getNumElements())) {
      InstructionCost ExtraCost = 0;
      if (LT.first != 1) {
        // Type needs to be split, so there is an extra cost of LT.first - 1
        // arithmetic ops.
        auto *Ty = FixedVectorType::get(ValTy->getElementType(),
                                        MTy.getVectorNumElements());
        ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
        ExtraCost *= LT.first - 1;
      }
      return Entry->Cost + ExtraCost;
    }
    break;
  }
  // No table entry (or preconditions not met): use the generic model.
  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
                                           CostKind);
}
1859 
/// Cost model for AArch64 shuffles. Shuffle kinds that map onto single (or a
/// small fixed number of) native instructions are looked up in a cost table
/// keyed on the legalized type; everything else falls back to the generic
/// implementation.
InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                               VectorType *Tp,
                                               ArrayRef<int> Mask, int Index,
                                               VectorType *SubTp) {
  // Try to refine the shuffle kind from the concrete mask so that more
  // shuffles hit the table below.
  Kind = improveShuffleKindFromMask(Kind, Mask);
  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
      Kind == TTI::SK_Reverse) {
    static const CostTblEntry ShuffleTbl[] = {
      // Broadcast shuffle kinds can be performed with 'dup'.
      { TTI::SK_Broadcast, MVT::v8i8,  1 },
      { TTI::SK_Broadcast, MVT::v16i8, 1 },
      { TTI::SK_Broadcast, MVT::v4i16, 1 },
      { TTI::SK_Broadcast, MVT::v8i16, 1 },
      { TTI::SK_Broadcast, MVT::v2i32, 1 },
      { TTI::SK_Broadcast, MVT::v4i32, 1 },
      { TTI::SK_Broadcast, MVT::v2i64, 1 },
      { TTI::SK_Broadcast, MVT::v2f32, 1 },
      { TTI::SK_Broadcast, MVT::v4f32, 1 },
      { TTI::SK_Broadcast, MVT::v2f64, 1 },
      // Transpose shuffle kinds can be performed with 'trn1/trn2' and
      // 'zip1/zip2' instructions.
      { TTI::SK_Transpose, MVT::v8i8,  1 },
      { TTI::SK_Transpose, MVT::v16i8, 1 },
      { TTI::SK_Transpose, MVT::v4i16, 1 },
      { TTI::SK_Transpose, MVT::v8i16, 1 },
      { TTI::SK_Transpose, MVT::v2i32, 1 },
      { TTI::SK_Transpose, MVT::v4i32, 1 },
      { TTI::SK_Transpose, MVT::v2i64, 1 },
      { TTI::SK_Transpose, MVT::v2f32, 1 },
      { TTI::SK_Transpose, MVT::v4f32, 1 },
      { TTI::SK_Transpose, MVT::v2f64, 1 },
      // Select shuffle kinds.
      // TODO: handle vXi8/vXi16.
      { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
      { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
      { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
      { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
      { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
      { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
      // PermuteSingleSrc shuffle kinds.
      // TODO: handle vXi8/vXi16.
      { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
      { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
      { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
      { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
      { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
      { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
      // Reverse can be lowered with `rev`.
      { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov.
      { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT
      { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov.
      { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov.
      { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT
      { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov.
      // Broadcast shuffle kinds for scalable vectors
      { TTI::SK_Broadcast, MVT::nxv16i8,  1 },
      { TTI::SK_Broadcast, MVT::nxv8i16,  1 },
      { TTI::SK_Broadcast, MVT::nxv4i32,  1 },
      { TTI::SK_Broadcast, MVT::nxv2i64,  1 },
      { TTI::SK_Broadcast, MVT::nxv2f16,  1 },
      { TTI::SK_Broadcast, MVT::nxv4f16,  1 },
      { TTI::SK_Broadcast, MVT::nxv8f16,  1 },
      { TTI::SK_Broadcast, MVT::nxv2bf16, 1 },
      { TTI::SK_Broadcast, MVT::nxv4bf16, 1 },
      { TTI::SK_Broadcast, MVT::nxv8bf16, 1 },
      { TTI::SK_Broadcast, MVT::nxv2f32,  1 },
      { TTI::SK_Broadcast, MVT::nxv4f32,  1 },
      { TTI::SK_Broadcast, MVT::nxv2f64,  1 },
      { TTI::SK_Broadcast, MVT::nxv16i1,  1 },
      { TTI::SK_Broadcast, MVT::nxv8i1,   1 },
      { TTI::SK_Broadcast, MVT::nxv4i1,   1 },
      { TTI::SK_Broadcast, MVT::nxv2i1,   1 },
      // Handle the cases for vector.reverse with scalable vectors
      { TTI::SK_Reverse, MVT::nxv16i8,  1 },
      { TTI::SK_Reverse, MVT::nxv8i16,  1 },
      { TTI::SK_Reverse, MVT::nxv4i32,  1 },
      { TTI::SK_Reverse, MVT::nxv2i64,  1 },
      { TTI::SK_Reverse, MVT::nxv2f16,  1 },
      { TTI::SK_Reverse, MVT::nxv4f16,  1 },
      { TTI::SK_Reverse, MVT::nxv8f16,  1 },
      { TTI::SK_Reverse, MVT::nxv2bf16, 1 },
      { TTI::SK_Reverse, MVT::nxv4bf16, 1 },
      { TTI::SK_Reverse, MVT::nxv8bf16, 1 },
      { TTI::SK_Reverse, MVT::nxv2f32,  1 },
      { TTI::SK_Reverse, MVT::nxv4f32,  1 },
      { TTI::SK_Reverse, MVT::nxv2f64,  1 },
      { TTI::SK_Reverse, MVT::nxv16i1,  1 },
      { TTI::SK_Reverse, MVT::nxv8i1,   1 },
      { TTI::SK_Reverse, MVT::nxv4i1,   1 },
      { TTI::SK_Reverse, MVT::nxv2i1,   1 },
    };
    // Table costs are per legal part, so scale by the legalization split
    // count (LT.first).
    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
    if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}
1959