1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "ARMTargetTransformInfo.h"
10 #include "ARMSubtarget.h"
11 #include "MCTargetDesc/ARMAddressingModes.h"
12 #include "llvm/ADT/APInt.h"
13 #include "llvm/ADT/SmallVector.h"
14 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/CodeGen/CostTable.h"
16 #include "llvm/CodeGen/ISDOpcodes.h"
17 #include "llvm/CodeGen/ValueTypes.h"
18 #include "llvm/IR/BasicBlock.h"
19 #include "llvm/IR/DataLayout.h"
20 #include "llvm/IR/DerivedTypes.h"
21 #include "llvm/IR/Instruction.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/Intrinsics.h"
24 #include "llvm/IR/IntrinsicInst.h"
25 #include "llvm/IR/IntrinsicsARM.h"
26 #include "llvm/IR/PatternMatch.h"
27 #include "llvm/IR/Type.h"
28 #include "llvm/MC/SubtargetFeature.h"
29 #include "llvm/Support/Casting.h"
30 #include "llvm/Support/KnownBits.h"
31 #include "llvm/Support/MachineValueType.h"
32 #include "llvm/Target/TargetMachine.h"
33 #include "llvm/Transforms/InstCombine/InstCombiner.h"
34 #include "llvm/Transforms/Utils/Local.h"
35 #include "llvm/Transforms/Utils/LoopUtils.h"
36 #include <algorithm>
37 #include <cassert>
38 #include <cstdint>
39 #include <utility>
40 
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "armtti"
44 
45 static cl::opt<bool> EnableMaskedLoadStores(
46   "enable-arm-maskedldst", cl::Hidden, cl::init(true),
47   cl::desc("Enable the generation of masked loads and stores"));
48 
49 static cl::opt<bool> DisableLowOverheadLoops(
50   "disable-arm-loloops", cl::Hidden, cl::init(false),
51   cl::desc("Disable the generation of low-overhead loops"));
52 
53 static cl::opt<bool>
54     AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
55                   cl::desc("Enable the generation of WLS loops"));
56 
57 extern cl::opt<TailPredication::Mode> EnableTailPredication;
58 
59 extern cl::opt<bool> EnableMaskedGatherScatters;
60 
61 extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
62 
63 /// Convert a vector load intrinsic into a simple llvm load instruction.
64 /// This is beneficial when the underlying object being addressed comes
65 /// from a constant, since we get constant-folding for free.
66 static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
67                                InstCombiner::BuilderTy &Builder) {
68   auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
69 
70   if (!IntrAlign)
71     return nullptr;
72 
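  // Use whichever is larger: the alignment encoded in the intrinsic's align
  // argument or the alignment we could prove for the pointer (MemAlign).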
73   unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
74                            ? MemAlign
75                            : IntrAlign->getLimitedValue();
76 
77   if (!isPowerOf2_32(Alignment))
78     return nullptr;
79 
80   auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
81                                           PointerType::get(II.getType(), 0));
82   return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
83 }
84 
85 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
86                                      const Function *Callee) const {
87   const TargetMachine &TM = getTLI()->getTargetMachine();
88   const FeatureBitset &CallerBits =
89       TM.getSubtargetImpl(*Caller)->getFeatureBits();
90   const FeatureBitset &CalleeBits =
91       TM.getSubtargetImpl(*Callee)->getFeatureBits();
92 
  // To inline a callee, all features not in the allowed list must match
  // exactly.
94   bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
95                     (CalleeBits & ~InlineFeaturesAllowed);
96   // For features in the allowed list, the callee's features must be a subset of
97   // the callers'.
98   bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
99                      (CalleeBits & InlineFeaturesAllowed);
100   return MatchExact && MatchSubset;
101 }
102 
103 TTI::AddressingModeKind
104 ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
105                                        ScalarEvolution *SE) const {
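  // MVE benefits from post-indexed addressing, where the pointer update can
  // be folded into the load/store itself.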
106   if (ST->hasMVEIntegerOps())
107     return TTI::AMK_PostIndexed;
108 
109   if (L->getHeader()->getParent()->hasOptSize())
110     return TTI::AMK_None;
111 
112   if (ST->isMClass() && ST->isThumb2() &&
113       L->getNumBlocks() == 1)
114     return TTI::AMK_PreIndexed;
115 
116   return TTI::AMK_None;
117 }
118 
119 Optional<Instruction *>
120 ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
121   using namespace PatternMatch;
122   Intrinsic::ID IID = II.getIntrinsicID();
123   switch (IID) {
124   default:
125     break;
126   case Intrinsic::arm_neon_vld1: {
127     Align MemAlign =
128         getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
129                           &IC.getAssumptionCache(), &IC.getDominatorTree());
130     if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
131       return IC.replaceInstUsesWith(II, V);
132     }
133     break;
134   }
135 
136   case Intrinsic::arm_neon_vld2:
137   case Intrinsic::arm_neon_vld3:
138   case Intrinsic::arm_neon_vld4:
139   case Intrinsic::arm_neon_vld2lane:
140   case Intrinsic::arm_neon_vld3lane:
141   case Intrinsic::arm_neon_vld4lane:
142   case Intrinsic::arm_neon_vst1:
143   case Intrinsic::arm_neon_vst2:
144   case Intrinsic::arm_neon_vst3:
145   case Intrinsic::arm_neon_vst4:
146   case Intrinsic::arm_neon_vst2lane:
147   case Intrinsic::arm_neon_vst3lane:
148   case Intrinsic::arm_neon_vst4lane: {
149     Align MemAlign =
150         getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
151                           &IC.getAssumptionCache(), &IC.getDominatorTree());
152     unsigned AlignArg = II.arg_size() - 1;
153     Value *AlignArgOp = II.getArgOperand(AlignArg);
154     MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
155     if (Align && *Align < MemAlign) {
156       return IC.replaceOperand(
157           II, AlignArg,
158           ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
159                            false));
160     }
161     break;
162   }
163 
164   case Intrinsic::arm_mve_pred_i2v: {
165     Value *Arg = II.getArgOperand(0);
166     Value *ArgArg;
167     if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
168                        PatternMatch::m_Value(ArgArg))) &&
169         II.getType() == ArgArg->getType()) {
170       return IC.replaceInstUsesWith(II, ArgArg);
171     }
172     Constant *XorMask;
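    // i2v(v2i(x) ^ m): if the constant m covers all 16 predicate bits, the
    // whole expression is just a lane-wise NOT of x.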
173     if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
174                              PatternMatch::m_Value(ArgArg)),
175                          PatternMatch::m_Constant(XorMask))) &&
176         II.getType() == ArgArg->getType()) {
177       if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
178         if (CI->getValue().trunc(16).isAllOnesValue()) {
179           auto TrueVector = IC.Builder.CreateVectorSplat(
180               cast<FixedVectorType>(II.getType())->getNumElements(),
181               IC.Builder.getTrue());
182           return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
183         }
184       }
185     }
186     KnownBits ScalarKnown(32);
187     if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
188                                 ScalarKnown, 0)) {
189       return &II;
190     }
191     break;
192   }
193   case Intrinsic::arm_mve_pred_v2i: {
194     Value *Arg = II.getArgOperand(0);
195     Value *ArgArg;
196     if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
197                        PatternMatch::m_Value(ArgArg)))) {
198       return IC.replaceInstUsesWith(II, ArgArg);
199     }
200     if (!II.getMetadata(LLVMContext::MD_range)) {
201       Type *IntTy32 = Type::getInt32Ty(II.getContext());
202       Metadata *M[] = {
203           ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
204           ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
205       II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
206       return &II;
207     }
208     break;
209   }
210   case Intrinsic::arm_mve_vadc:
211   case Intrinsic::arm_mve_vadc_predicated: {
212     unsigned CarryOp =
213         (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
214     assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
215            "Bad type for intrinsic!");
216 
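    // Only bit 29 of the carry operand is consumed (the position of the carry
    // flag in FPSCR), so any other bits can be simplified away.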
217     KnownBits CarryKnown(32);
218     if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
219                                 CarryKnown)) {
220       return &II;
221     }
222     break;
223   }
224   case Intrinsic::arm_mve_vmldava: {
225     Instruction *I = cast<Instruction>(&II);
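    // add(vmldava(acc=0, x, y), z) can be folded into vmldava(acc=z, x, y),
    // saving the separate add.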
226     if (I->hasOneUse()) {
227       auto *User = cast<Instruction>(*I->user_begin());
228       Value *OpZ;
229       if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
230           match(I->getOperand(3), m_Zero())) {
231         Value *OpX = I->getOperand(4);
232         Value *OpY = I->getOperand(5);
233         Type *OpTy = OpX->getType();
234 
235         IC.Builder.SetInsertPoint(User);
236         Value *V =
237             IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
238                                        {I->getOperand(0), I->getOperand(1),
239                                         I->getOperand(2), OpZ, OpX, OpY});
240 
241         IC.replaceInstUsesWith(*User, V);
242         return IC.eraseInstFromFunction(*User);
243       }
244     }
245     return None;
246   }
247   }
248   return None;
249 }
250 
251 Optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
252     InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
253     APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
254     std::function<void(Instruction *, unsigned, APInt, APInt &)>
255         SimplifyAndSetOp) const {
256 
  // Compute the demanded bits for a narrowing MVE intrinsic. TopOpc is the
  // index of the operand that says whether this is a Top or Bottom
  // instruction, which can change between intrinsics.
  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
261     unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
262     unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
263 
    // Only the odd/even lanes of operand 0 will be demanded, depending on
    // whether this is a top/bottom instruction.
266     APInt DemandedElts =
267         APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
268                                        : APInt::getHighBitsSet(2, 1));
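    // For example, with 8 lanes and IsTop set this demands lanes 0, 2, 4 and
    // 6 of operand 0 (bit pattern 0b01010101).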
269     SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
270     // The other lanes will be defined from the inserted elements.
271     UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
272                                                  : APInt::getHighBitsSet(2, 1));
273     return None;
274   };
275 
276   switch (II.getIntrinsicID()) {
277   default:
278     break;
279   case Intrinsic::arm_mve_vcvt_narrow:
280     SimplifyNarrowInstrTopBottom(2);
281     break;
282   case Intrinsic::arm_mve_vqmovn:
283     SimplifyNarrowInstrTopBottom(4);
284     break;
285   case Intrinsic::arm_mve_vshrn:
286     SimplifyNarrowInstrTopBottom(7);
287     break;
288   }
289 
290   return None;
291 }
292 
293 InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
294                                           TTI::TargetCostKind CostKind) {
295   assert(Ty->isIntegerTy());
296 
  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;
300 
301   int64_t SImmVal = Imm.getSExtValue();
302   uint64_t ZImmVal = Imm.getZExtValue();
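  // Immediates that fit in 16 bits, or that can be encoded (possibly
  // inverted) as an ARM/Thumb2 modified immediate, need only one instruction.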
303   if (!ST->isThumb()) {
304     if ((SImmVal >= 0 && SImmVal < 65536) ||
305         (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
306         (ARM_AM::getSOImmVal(~ZImmVal) != -1))
307       return 1;
308     return ST->hasV6T2Ops() ? 2 : 3;
309   }
310   if (ST->isThumb2()) {
311     if ((SImmVal >= 0 && SImmVal < 65536) ||
312         (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
313         (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
314       return 1;
315     return ST->hasV6T2Ops() ? 2 : 3;
316   }
  // Thumb1: any i8 immediate costs 1.
318   if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
319     return 1;
320   if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
321     return 2;
322   // Load from constantpool.
323   return 3;
324 }
325 
// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero for them and 1 otherwise.
328 InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
329                                                   const APInt &Imm, Type *Ty) {
330   if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
331     return 0;
332 
333   return 1;
334 }
335 
336 // Checks whether Inst is part of a min(max()) or max(min()) pattern
337 // that will match to an SSAT instruction
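// e.g. smax(smin(x, 127), -128) saturates x to 8 bits and can become ssat #8.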
338 static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
339   Value *LHS, *RHS;
340   ConstantInt *C;
341   SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
342 
343   if (InstSPF == SPF_SMAX &&
344       PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
345       C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
346 
347     auto isSSatMin = [&](Value *MinInst) {
348       if (isa<SelectInst>(MinInst)) {
349         Value *MinLHS, *MinRHS;
350         ConstantInt *MinC;
351         SelectPatternFlavor MinSPF =
352             matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
353         if (MinSPF == SPF_SMIN &&
354             PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
355             MinC->getValue() == ((-Imm) - 1))
356           return true;
357       }
358       return false;
359     };
360 
361     if (isSSatMin(Inst->getOperand(1)) ||
362         (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
363                                isSSatMin(*(++Inst->user_begin())))))
364       return true;
365   }
366   return false;
367 }
368 
369 InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
370                                               const APInt &Imm, Type *Ty,
371                                               TTI::TargetCostKind CostKind,
372                                               Instruction *Inst) {
373   // Division by a constant can be turned into multiplication, but only if we
374   // know it's constant. So it's not so much that the immediate is cheap (it's
375   // not), but that the alternative is worse.
376   // FIXME: this is probably unneeded with GlobalISel.
377   if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
378        Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
379       Idx == 1)
380     return 0;
381 
382   // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
383   // splitting any large offsets.
384   if (Opcode == Instruction::GetElementPtr && Idx != 0)
385     return 0;
386 
387   if (Opcode == Instruction::And) {
388     // UXTB/UXTH
389     if (Imm == 255 || Imm == 65535)
390       return 0;
391     // Conversion to BIC is free, and means we can use ~Imm instead.
392     return std::min(getIntImmCost(Imm, Ty, CostKind),
393                     getIntImmCost(~Imm, Ty, CostKind));
394   }
395 
396   if (Opcode == Instruction::Add)
397     // Conversion to SUB is free, and means we can use -Imm instead.
398     return std::min(getIntImmCost(Imm, Ty, CostKind),
399                     getIntImmCost(-Imm, Ty, CostKind));
400 
401   if (Opcode == Instruction::ICmp && Imm.isNegative() &&
402       Ty->getIntegerBitWidth() == 32) {
403     int64_t NegImm = -Imm.getSExtValue();
404     if (ST->isThumb2() && NegImm < 1<<12)
405       // icmp X, #-C -> cmn X, #C
406       return 0;
407     if (ST->isThumb() && NegImm < 1<<8)
408       // icmp X, #-C -> adds X, #C
409       return 0;
410   }
411 
412   // xor a, -1 can always be folded to MVN
413   if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
414     return 0;
415 
  // Ensure that negative constants of min(max()) or max(min()) patterns that
  // match SSAT instructions don't get hoisted.
418   if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
419       Ty->getIntegerBitWidth() <= 32) {
420     if (isSSATMinMaxPattern(Inst, Imm) ||
421         (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
422          isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
423       return 0;
424   }
425 
426   return getIntImmCost(Imm, Ty, CostKind);
427 }
428 
429 InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
430                                            TTI::TargetCostKind CostKind,
431                                            const Instruction *I) {
432   if (CostKind == TTI::TCK_RecipThroughput &&
433       (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensitive to the cost of these
    // instructions, which suggests that it may be using the costs incorrectly.
    // But, for now, just make them free to avoid performance regressions for
    // vector targets.
438     return 0;
439   }
440   return BaseT::getCFInstrCost(Opcode, CostKind, I);
441 }
442 
443 InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
444                                              Type *Src,
445                                              TTI::CastContextHint CCH,
446                                              TTI::TargetCostKind CostKind,
447                                              const Instruction *I) {
448   int ISD = TLI->InstructionOpcodeToISD(Opcode);
449   assert(ISD && "Invalid opcode");
450 
451   // TODO: Allow non-throughput costs that aren't binary.
452   auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
453     if (CostKind != TTI::TCK_RecipThroughput)
454       return Cost == 0 ? 0 : 1;
455     return Cost;
456   };
457   auto IsLegalFPType = [this](EVT VT) {
458     EVT EltVT = VT.getScalarType();
459     return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
460             (EltVT == MVT::f64 && ST->hasFP64()) ||
461             (EltVT == MVT::f16 && ST->hasFullFP16());
462   };
463 
464   EVT SrcTy = TLI->getValueType(DL, Src);
465   EVT DstTy = TLI->getValueType(DL, Dst);
466 
467   if (!SrcTy.isSimple() || !DstTy.isSimple())
468     return AdjustCost(
469         BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
470 
  // Extending masked loads and truncating masked stores are expensive because
  // we currently don't split them. This means that we'll likely end up
  // loading/storing each element individually (hence the high cost).
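  // For example, a sign-extending masked load from <8 x i16> to <8 x i32>
  // (a 256-bit result) is costed at 2 * 8 * the MVE cost factor below.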
474   if ((ST->hasMVEIntegerOps() &&
475        (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
476         Opcode == Instruction::SExt)) ||
477       (ST->hasMVEFloatOps() &&
478        (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
479        IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
480     if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
481       return 2 * DstTy.getVectorNumElements() *
482              ST->getMVEVectorCostFactor(CostKind);
483 
484   // The extend of other kinds of load is free
485   if (CCH == TTI::CastContextHint::Normal ||
486       CCH == TTI::CastContextHint::Masked) {
487     static const TypeConversionCostTblEntry LoadConversionTbl[] = {
488         {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
489         {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
490         {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
491         {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
492         {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
493         {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
494         {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
495         {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
496         {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
497         {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
498         {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
499         {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
500     };
501     if (const auto *Entry = ConvertCostTableLookup(
502             LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
503       return AdjustCost(Entry->Cost);
504 
505     static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
506         {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
507         {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
508         {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
509         {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
510         {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
511         {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        // The following extend from a legal type to an illegal type, so they
        // need to split the load. This introduces an extra load operation, but
        // the extend is still "free".
515         {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
516         {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
517         {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
518         {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
519         {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
520         {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
521     };
522     if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
523       if (const auto *Entry =
524               ConvertCostTableLookup(MVELoadConversionTbl, ISD,
525                                      DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
526         return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
527     }
528 
529     static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
530         // FPExtends are similar but also require the VCVT instructions.
531         {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
532         {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
533     };
534     if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
535       if (const auto *Entry =
536               ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
537                                      DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
538         return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
539     }
540 
541     // The truncate of a store is free. This is the mirror of extends above.
542     static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
543         {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
544         {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
545         {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
546         {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
547         {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
548         {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
549         {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
550     };
551     if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
552       if (const auto *Entry =
553               ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
554                                      SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
555         return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
556     }
557 
558     static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
559         {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
560         {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
561     };
562     if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
563       if (const auto *Entry =
564               ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
565                                      SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
566         return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
567     }
568   }
569 
570   // NEON vector operations that can extend their inputs.
571   if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
572       I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
573     static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
574       // vaddl
575       { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
576       { ISD::ADD, MVT::v8i16, MVT::v8i8,  0 },
577       // vsubl
578       { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
579       { ISD::SUB, MVT::v8i16, MVT::v8i8,  0 },
580       // vmull
581       { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
582       { ISD::MUL, MVT::v8i16, MVT::v8i8,  0 },
583       // vshll
584       { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
585       { ISD::SHL, MVT::v8i16, MVT::v8i8,  0 },
586     };
587 
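    // If the single user is one of the operations above, the extend is
    // assumed to be folded into it (e.g. add(sext(x), sext(y)) becomes a
    // single vaddl), so the extend itself is treated as free.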
588     auto *User = cast<Instruction>(*I->user_begin());
589     int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
590     if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
591                                              DstTy.getSimpleVT(),
592                                              SrcTy.getSimpleVT())) {
593       return AdjustCost(Entry->Cost);
594     }
595   }
596 
597   // Single to/from double precision conversions.
598   if (Src->isVectorTy() && ST->hasNEON() &&
599       ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
600         DstTy.getScalarType() == MVT::f32) ||
601        (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
602         DstTy.getScalarType() == MVT::f64))) {
603     static const CostTblEntry NEONFltDblTbl[] = {
604         // Vector fptrunc/fpext conversions.
605         {ISD::FP_ROUND, MVT::v2f64, 2},
606         {ISD::FP_EXTEND, MVT::v2f32, 2},
607         {ISD::FP_EXTEND, MVT::v4f32, 4}};
608 
609     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
610     if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
611       return AdjustCost(LT.first * Entry->Cost);
612   }
613 
614   // Some arithmetic, load and store operations have specific instructions
615   // to cast up/down their types automatically at no extra cost.
616   // TODO: Get these tables to know at least what the related operations are.
617   static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
618     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
619     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
620     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
621     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
622     { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
623     { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },
624 
625     // The number of vmovl instructions for the extension.
626     { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
627     { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
628     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
629     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
630     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
631     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
632     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
633     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
634     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
635     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
636     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
637     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
638     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
639     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
640     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
641     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
642     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
643     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
644 
645     // Operations that we legalize using splitting.
646     { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
647     { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },
648 
649     // Vector float <-> i32 conversions.
650     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
651     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
652 
653     { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
654     { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
655     { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
656     { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
657     { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
658     { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
659     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
660     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
661     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
662     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
663     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
664     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
665     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
666     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
667     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
668     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
669     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
670     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
671     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
672     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
673 
674     { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
675     { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
676     { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
677     { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
678     { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
679     { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },
680 
681     // Vector double <-> i32 conversions.
682     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
683     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
684 
685     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
686     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
687     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
688     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
689     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
690     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
691 
692     { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
693     { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
694     { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
695     { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
696     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
697     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
698   };
699 
700   if (SrcTy.isVector() && ST->hasNEON()) {
701     if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
702                                                    DstTy.getSimpleVT(),
703                                                    SrcTy.getSimpleVT()))
704       return AdjustCost(Entry->Cost);
705   }
706 
707   // Scalar float to integer conversions.
708   static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
709     { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
710     { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
711     { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
712     { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
713     { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
714     { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
715     { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
716     { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
717     { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
718     { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
719     { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
720     { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
721     { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
722     { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
723     { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
724     { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
725     { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
726     { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
727     { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
728     { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
729   };
730   if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
731     if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
732                                                    DstTy.getSimpleVT(),
733                                                    SrcTy.getSimpleVT()))
734       return AdjustCost(Entry->Cost);
735   }
736 
737   // Scalar integer to float conversions.
738   static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
739     { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
740     { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
741     { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
742     { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
743     { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
744     { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
745     { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
746     { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
747     { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
748     { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
749     { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
750     { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
751     { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
752     { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
753     { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
754     { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
755     { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
756     { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
757     { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
758     { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
759   };
760 
761   if (SrcTy.isInteger() && ST->hasNEON()) {
762     if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
763                                                    ISD, DstTy.getSimpleVT(),
764                                                    SrcTy.getSimpleVT()))
765       return AdjustCost(Entry->Cost);
766   }
767 
  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; i64
  // sexts are linearised so take more.
771   static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
772     { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
773     { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
774     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
775     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
776     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
777     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
778     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
779     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
780     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
781     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
782     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
783     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
784   };
785 
786   if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
787     if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
788                                                    ISD, DstTy.getSimpleVT(),
789                                                    SrcTy.getSimpleVT()))
790       return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
791   }
792 
793   if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
    // As a general rule, fp converts that were not matched above are
    // scalarized and cost 1 vcvt for each lane, so long as the instruction is
    // available. If not it will become a series of function calls.
797     const InstructionCost CallCost =
798         getCallInstrCost(nullptr, Dst, {Src}, CostKind);
799     int Lanes = 1;
800     if (SrcTy.isFixedLengthVector())
801       Lanes = SrcTy.getVectorNumElements();
802 
803     if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
804       return Lanes;
805     else
806       return Lanes * CallCost;
807   }
808 
809   if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
810       SrcTy.isFixedLengthVector()) {
    // Treat a truncate with a larger than legal source (128 bits for MVE) as
    // expensive, 2 instructions per lane.
813     if ((SrcTy.getScalarType() == MVT::i8 ||
814          SrcTy.getScalarType() == MVT::i16 ||
815          SrcTy.getScalarType() == MVT::i32) &&
816         SrcTy.getSizeInBits() > 128 &&
817         SrcTy.getSizeInBits() > DstTy.getSizeInBits())
818       return SrcTy.getVectorNumElements() * 2;
819   }
820 
821   // Scalar integer conversion costs.
822   static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
823     // i16 -> i64 requires two dependent operations.
824     { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
825 
826     // Truncates on i64 are assumed to be free.
827     { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
828     { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
829     { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
830     { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
831   };
832 
833   if (SrcTy.isInteger()) {
834     if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
835                                                    DstTy.getSimpleVT(),
836                                                    SrcTy.getSimpleVT()))
837       return AdjustCost(Entry->Cost);
838   }
839 
840   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
841                      ? ST->getMVEVectorCostFactor(CostKind)
842                      : 1;
843   return AdjustCost(
844       BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
845 }
846 
847 InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
848                                                unsigned Index) {
  // Penalize inserting into a D-subregister. We end up with a three times
  // lower estimated throughput on Swift.
851   if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
852       ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
853     return 3;
854 
855   if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
856                         Opcode == Instruction::ExtractElement)) {
857     // Cross-class copies are expensive on many microarchitectures,
858     // so assume they are expensive by default.
859     if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
860       return 3;
861 
    // Even if it's not a cross class copy, this likely leads to mixing of
    // NEON and VFP code and should therefore be penalized.
864     if (ValTy->isVectorTy() &&
865         ValTy->getScalarSizeInBits() <= 32)
866       return std::max<InstructionCost>(
867           BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
868   }
869 
870   if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
871                                  Opcode == Instruction::ExtractElement)) {
    // Integer cross-lane moves are more expensive than float, which can
    // sometimes just be vmovs. Integer moves involve being passed through GPR
    // registers, causing more of a delay.
875     std::pair<InstructionCost, MVT> LT =
876         getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
877     return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
878   }
879 
880   return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
881 }
882 
883 InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
884                                                Type *CondTy,
885                                                CmpInst::Predicate VecPred,
886                                                TTI::TargetCostKind CostKind,
887                                                const Instruction *I) {
888   int ISD = TLI->InstructionOpcodeToISD(Opcode);
889 
890   // Thumb scalar code size cost for select.
891   if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
892       ST->isThumb() && !ValTy->isVectorTy()) {
893     // Assume expensive structs.
894     if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
895       return TTI::TCC_Expensive;
896 
897     // Select costs can vary because they:
898     // - may require one or more conditional mov (including an IT),
899     // - can't operate directly on immediates,
900     // - require live flags, which we can't copy around easily.
901     InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
902 
903     // Possible IT instruction for Thumb2, or more for Thumb1.
904     ++Cost;
905 
906     // i1 values may need rematerialising by using mov immediates and/or
907     // flag setting instructions.
908     if (ValTy->isIntegerTy(1))
909       ++Cost;
910 
911     return Cost;
912   }
913 
914   // If this is a vector min/max/abs, use the cost of that intrinsic directly
915   // instead. Hopefully when min/max intrinsics are more prevalent this code
916   // will not be needed.
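  // For example, select(icmp slt x, y), x, y is costed as an smin intrinsic.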
917   const Instruction *Sel = I;
918   if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
919       Sel->hasOneUse())
920     Sel = cast<Instruction>(Sel->user_back());
921   if (Sel && ValTy->isVectorTy() &&
922       (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
923     const Value *LHS, *RHS;
924     SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
925     unsigned IID = 0;
926     switch (SPF) {
927     case SPF_ABS:
928       IID = Intrinsic::abs;
929       break;
930     case SPF_SMIN:
931       IID = Intrinsic::smin;
932       break;
933     case SPF_SMAX:
934       IID = Intrinsic::smax;
935       break;
936     case SPF_UMIN:
937       IID = Intrinsic::umin;
938       break;
939     case SPF_UMAX:
940       IID = Intrinsic::umax;
941       break;
942     case SPF_FMINNUM:
943       IID = Intrinsic::minnum;
944       break;
945     case SPF_FMAXNUM:
946       IID = Intrinsic::maxnum;
947       break;
948     default:
949       break;
950     }
951     if (IID) {
952       // The ICmp is free, the select gets the cost of the min/max/etc
953       if (Sel != I)
954         return 0;
955       IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
956       return getIntrinsicInstrCost(CostAttrs, CostKind);
957     }
958   }
959 
960   // On NEON a vector select gets lowered to vbsl.
961   if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
962     // Lowering of some vector selects is currently far from perfect.
963     static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
964       { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
965       { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
966       { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
967     };
968 
969     EVT SelCondTy = TLI->getValueType(DL, CondTy);
970     EVT SelValTy = TLI->getValueType(DL, ValTy);
971     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
972       if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
973                                                      SelCondTy.getSimpleVT(),
974                                                      SelValTy.getSimpleVT()))
975         return Entry->Cost;
976     }
977 
978     std::pair<InstructionCost, MVT> LT =
979         TLI->getTypeLegalizationCost(DL, ValTy);
980     return LT.first;
981   }
982 
983   if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
984       (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
985       cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
986     FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
987     FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
988     if (!VecCondTy)
989       VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
990 
991     // If we don't have mve.fp any fp operations will need to be scalarized.
992     if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
      // One scalarization insert, one scalarization extract and the cost of
      // the fcmps.
995       return BaseT::getScalarizationOverhead(VecValTy, false, true) +
996              BaseT::getScalarizationOverhead(VecCondTy, true, false) +
997              VecValTy->getNumElements() *
998                  getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
999                                     VecCondTy->getScalarType(), VecPred, CostKind,
1000                                     I);
1001     }
1002 
1003     std::pair<InstructionCost, MVT> LT =
1004         TLI->getTypeLegalizationCost(DL, ValTy);
1005     int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1006     // There are two types - the input that specifies the type of the compare
1007     // and the output vXi1 type. Because we don't know how the output will be
1008     // split, we may need an expensive shuffle to get two in sync. This has the
1009     // effect of making larger than legal compares (v8i32 for example)
1010     // expensive.
1011     if (LT.second.getVectorNumElements() > 2) {
1012       if (LT.first > 1)
1013         return LT.first * BaseCost +
1014                BaseT::getScalarizationOverhead(VecCondTy, true, false);
1015       return BaseCost;
1016     }
1017   }
1018 
1019   // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1020   // for "multiple beats" potentially needed by MVE instructions.
1021   int BaseCost = 1;
1022   if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1023     BaseCost = ST->getMVEVectorCostFactor(CostKind);
1024 
1025   return BaseCost *
1026          BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1027 }
1028 
1029 InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1030                                                       ScalarEvolution *SE,
1031                                                       const SCEV *Ptr) {
1032   // Address computations in vectorized code with non-consecutive addresses will
1033   // likely result in more instructions compared to scalar code where the
1034   // computation can more often be merged into the index mode. The resulting
1035   // extra micro-ops can significantly decrease throughput.
1036   unsigned NumVectorInstToHideOverhead = 10;
1037   int MaxMergeDistance = 64;
1038 
1039   if (ST->hasNEON()) {
1040     if (Ty->isVectorTy() && SE &&
1041         !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1042       return NumVectorInstToHideOverhead;
1043 
1044     // In many cases the address computation is not merged into the instruction
1045     // addressing mode.
1046     return 1;
1047   }
1048   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1049 }
1050 
1051 bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1052   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1053     // If a VCTP is part of a chain, it's already profitable and shouldn't be
1054     // optimized, else LSR may block tail-predication.
1055     switch (II->getIntrinsicID()) {
1056     case Intrinsic::arm_mve_vctp8:
1057     case Intrinsic::arm_mve_vctp16:
1058     case Intrinsic::arm_mve_vctp32:
1059     case Intrinsic::arm_mve_vctp64:
1060       return true;
1061     default:
1062       break;
1063     }
1064   }
1065   return false;
1066 }
1067 
1068 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1069   if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1070     return false;
1071 
1072   if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1073     // Don't support v2i1 yet.
1074     if (VecTy->getNumElements() == 2)
1075       return false;
1076 
1077     // We don't support extending fp types.
    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1079     if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1080       return false;
1081   }
1082 
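  // MVE masked loads/stores need naturally aligned elements: 32-bit elements
  // need 4-byte alignment, 16-bit need 2-byte, and 8-bit are always fine.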
1083   unsigned EltWidth = DataTy->getScalarSizeInBits();
1084   return (EltWidth == 32 && Alignment >= 4) ||
1085          (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1086 }
1087 
1088 bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1089   if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1090     return false;
1091 
1092   // This method is called in 2 places:
1093   //  - from the vectorizer with a scalar type, in which case we need to get
1094   //  this as good as we can with the limited info we have (and rely on the cost
1095   //  model for the rest).
1096   //  - from the masked intrinsic lowering pass with the actual vector type.
1097   // For MVE, we have a custom lowering pass that will already have custom
1098   // legalised any gathers that we can to MVE intrinsics, and want to expand all
1099   // the rest. The pass runs before the masked intrinsic lowering pass, so if we
1100   // are here, we know we want to expand.
1101   if (isa<VectorType>(Ty))
1102     return false;
1103 
1104   unsigned EltWidth = Ty->getScalarSizeInBits();
1105   return ((EltWidth == 32 && Alignment >= 4) ||
1106           (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1107 }
1108 
1109 /// Given a memcpy/memset/memmove instruction, return the number of memory
1110 /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1111 /// call is used.
1112 int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1113   MemOp MOp;
1114   unsigned DstAddrSpace = ~0u;
1115   unsigned SrcAddrSpace = ~0u;
1116   const Function *F = I->getParent()->getParent();
1117 
1118   if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1119     ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1120     // If 'size' is not a constant, a library call will be generated.
1121     if (!C)
1122       return -1;
1123 
1124     const unsigned Size = C->getValue().getZExtValue();
1125     const Align DstAlign = *MC->getDestAlign();
1126     const Align SrcAlign = *MC->getSourceAlign();
1127 
1128     MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1129                       /*IsVolatile*/ false);
1130     DstAddrSpace = MC->getDestAddressSpace();
1131     SrcAddrSpace = MC->getSourceAddressSpace();
1132   }
1133   else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1134     ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1135     // If 'size' is not a constant, a library call will be generated.
1136     if (!C)
1137       return -1;
1138 
1139     const unsigned Size = C->getValue().getZExtValue();
1140     const Align DstAlign = *MS->getDestAlign();
1141 
1142     MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1143                      /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1144     DstAddrSpace = MS->getDestAddressSpace();
1145   }
1146   else
1147     llvm_unreachable("Expected a memcpy/move or memset!");
1148 
1149   unsigned Limit, Factor = 2;
1150   switch(I->getIntrinsicID()) {
1151     case Intrinsic::memcpy:
1152       Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1153       break;
1154     case Intrinsic::memmove:
1155       Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1156       break;
1157     case Intrinsic::memset:
1158       Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1159       Factor = 1;
1160       break;
1161     default:
1162       llvm_unreachable("Expected a memcpy/move or memset!");
1163   }
1164 
  // MemOps will be populated with a list of the data types that need to be
  // loaded and stored. That's why we multiply the number of elements by 2 to
  // get the cost for this memcpy.
1168   std::vector<EVT> MemOps;
1169   if (getTLI()->findOptimalMemOpLowering(
1170           MemOps, Limit, MOp, DstAddrSpace,
1171           SrcAddrSpace, F->getAttributes()))
1172     return MemOps.size() * Factor;
1173 
1174   // If we can't find an optimal memop lowering, return the default cost
1175   return -1;
1176 }
1177 
1178 InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1179   int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1180 
1181   // To model the cost of a library call, we assume 1 for the call, and
1182   // 3 for the argument setup.
1183   if (NumOps == -1)
1184     return 4;
1185   return NumOps;
1186 }
1187 
1188 InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1189                                            VectorType *Tp, ArrayRef<int> Mask,
1190                                            int Index, VectorType *SubTp) {
1191   Kind = improveShuffleKindFromMask(Kind, Mask);
1192   if (ST->hasNEON()) {
1193     if (Kind == TTI::SK_Broadcast) {
1194       static const CostTblEntry NEONDupTbl[] = {
1195           // VDUP handles these cases.
1196           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1197           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1198           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1199           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1200           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1201           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1202 
1203           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1204           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1205           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1206           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1207 
1208       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1209       if (const auto *Entry =
1210               CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1211         return LT.first * Entry->Cost;
1212     }
1213     if (Kind == TTI::SK_Reverse) {
1214       static const CostTblEntry NEONShuffleTbl[] = {
1215           // Reverse shuffle cost one instruction if we are shuffling within a
1216           // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1217           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1218           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1219           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1220           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1221           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1222           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1223 
1224           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1225           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1226           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1227           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1228 
1229       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1230       if (const auto *Entry =
1231               CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1232         return LT.first * Entry->Cost;
1233     }
1234     if (Kind == TTI::SK_Select) {
1235       static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions required to create the shuffled vector.
1239 
1240           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1241           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1242           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1243           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1244 
1245           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1246           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1247           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1248 
1249           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1250 
1251           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1252 
1253       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1254       if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1255                                               ISD::VECTOR_SHUFFLE, LT.second))
1256         return LT.first * Entry->Cost;
1257     }
1258   }
1259   if (ST->hasMVEIntegerOps()) {
1260     if (Kind == TTI::SK_Broadcast) {
1261       static const CostTblEntry MVEDupTbl[] = {
1262           // VDUP handles these cases.
1263           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1264           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1265           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1266           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1267           {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1268 
1269       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1270       if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1271                                               LT.second))
1272         return LT.first * Entry->Cost *
1273                ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
1274     }
1275 
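    // Shuffles that reverse lanes within 16, 32 or 64-bit chunks map to a
    // single VREV16/VREV32/VREV64 instruction.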
1276     if (!Mask.empty()) {
1277       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1278       if (Mask.size() <= LT.second.getVectorNumElements() &&
1279           (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1280            isVREVMask(Mask, LT.second, 64)))
1281         return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1282     }
1283   }
1284 
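  // Otherwise fall back to the generic shuffle cost, scaled by the MVE beat
  // factor for vector types.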
1285   int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1286                      ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1287                      : 1;
1288   return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
1289 }
1290 
1291 InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1292     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1293     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
1294     TTI::OperandValueProperties Opd1PropInfo,
1295     TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1296     const Instruction *CxtI) {
1297   int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1298   if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1299     // Make operations on i1 relatively expensive as this often involves
1300     // combining predicates. AND and XOR should be easier to handle with IT
1301     // blocks.
1302     switch (ISDOpcode) {
1303     default:
1304       break;
1305     case ISD::AND:
1306     case ISD::XOR:
1307       return 2;
1308     case ISD::OR:
1309       return 3;
1310     }
1311   }
1312 
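  // LT.first is the cost of legalizing the type (roughly the number of parts
  // it is split into) and LT.second is the legalized type.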
1313   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1314 
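  // NEON has no vector integer divide or remainder instructions, so the table
  // below costs these as library calls, or as reciprocal-estimate sequences
  // for the small types that can use them.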
1315   if (ST->hasNEON()) {
1316     const unsigned FunctionCallDivCost = 20;
1317     const unsigned ReciprocalDivCost = 10;
1318     static const CostTblEntry CostTbl[] = {
1319       // Division.
      // These costs are somewhat random. Choose a cost of 20 to indicate that
      // vectorizing division (added function call) is going to be very
      // expensive.
1322       // Double registers types.
1323       { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1324       { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1325       { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1326       { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1327       { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1328       { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1329       { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1330       { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1331       { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
1332       { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
1333       { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1334       { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1335       { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
1336       { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
1337       { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
1338       { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
1339       // Quad register types.
1340       { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1341       { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1342       { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1343       { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1344       { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1345       { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1346       { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1347       { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1348       { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1349       { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1350       { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1351       { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1352       { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1353       { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1354       { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1355       { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1356       // Multiplication.
1357     };
1358 
1359     if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1360       return LT.first * Entry->Cost;
1361 
1362     InstructionCost Cost = BaseT::getArithmeticInstrCost(
1363         Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
1364 
1365     // This is somewhat of a hack. The problem that we are facing is that SROA
1366     // creates a sequence of shift, and, or instructions to construct values.
1367     // These sequences are recognized by the ISel and have zero-cost. Not so for
1368     // the vectorized code. Because we have support for v2i64 but not i64 those
1369     // sequences look particularly beneficial to vectorize.
1370     // To work around this we increase the cost of v2i64 operations to make them
1371     // seem less beneficial.
1372     if (LT.second == MVT::v2i64 &&
1373         Op2Info == TargetTransformInfo::OK_UniformConstantValue)
1374       Cost += 4;
1375 
1376     return Cost;
1377   }
1378 
1379   // If this operation is a shift on arm/thumb2, it might well be folded into
1380   // the following instruction, hence having a cost of 0.
1381   auto LooksLikeAFreeShift = [&]() {
1382     if (ST->isThumb1Only() || Ty->isVectorTy())
1383       return false;
1384 
1385     if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1386       return false;
1387     if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
1388       return false;
1389 
    // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB.
1391     switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1392     case Instruction::Add:
1393     case Instruction::Sub:
1394     case Instruction::And:
1395     case Instruction::Xor:
1396     case Instruction::Or:
1397     case Instruction::ICmp:
1398       return true;
1399     default:
1400       return false;
1401     }
1402   };
1403   if (LooksLikeAFreeShift())
1404     return 0;
1405 
1406   // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1407   // for "multiple beats" potentially needed by MVE instructions.
1408   int BaseCost = 1;
1409   if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1410     BaseCost = ST->getMVEVectorCostFactor(CostKind);
1411 
  // The rest of this mostly follows what is done in
  // BaseT::getArithmeticInstrCost, without treating floats as more expensive
  // than scalars or increasing the costs for custom operations. The result is
  // also multiplied by the MVEVectorCostFactor where appropriate.
1416   if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1417     return LT.first * BaseCost;
1418 
1419   // Else this is expand, assume that we need to scalarize this op.
1420   if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1421     unsigned Num = VTy->getNumElements();
1422     InstructionCost Cost =
1423         getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1424     // Return the cost of multiple scalar invocation plus the cost of
1425     // inserting and extracting the values.
1426     SmallVector<Type *> Tys(Args.size(), Ty);
1427     return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
1428   }
1429 
1430   return BaseCost;
1431 }
1432 
1433 InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1434                                             MaybeAlign Alignment,
1435                                             unsigned AddressSpace,
1436                                             TTI::TargetCostKind CostKind,
1437                                             const Instruction *I) {
1438   // TODO: Handle other cost kinds.
1439   if (CostKind != TTI::TCK_RecipThroughput)
1440     return 1;
1441 
1442   // Type legalization can't handle structs
1443   if (TLI->getValueType(DL, Src, true) == MVT::Other)
1444     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1445                                   CostKind);
1446 
1447   if (ST->hasNEON() && Src->isVectorTy() &&
1448       (Alignment && *Alignment != Align(16)) &&
1449       cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
    // Unaligned loads/stores are extremely inefficient.
    // We need 4 uops for vst1/vld1 vs one uop for vldr/vstr.
1452     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1453     return LT.first * 4;
1454   }
1455 
1456   // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1457   // Same for stores.
1458   if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1459       ((Opcode == Instruction::Load && I->hasOneUse() &&
1460         isa<FPExtInst>(*I->user_begin())) ||
1461        (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1462     FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1463     Type *DstTy =
1464         Opcode == Instruction::Load
1465             ? (*I->user_begin())->getType()
1466             : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1467     if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1468         DstTy->getScalarType()->isFloatTy())
1469       return ST->getMVEVectorCostFactor(CostKind);
1470   }
1471 
1472   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1473                      ? ST->getMVEVectorCostFactor(CostKind)
1474                      : 1;
1475   return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1476                                            CostKind, I);
1477 }
1478 
1479 InstructionCost
1480 ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1481                                   unsigned AddressSpace,
1482                                   TTI::TargetCostKind CostKind) {
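  // A legal MVE masked load or store is a single predicated memory operation,
  // so it only costs the vector beat factor.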
1483   if (ST->hasMVEIntegerOps()) {
1484     if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1485       return ST->getMVEVectorCostFactor(CostKind);
1486     if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1487       return ST->getMVEVectorCostFactor(CostKind);
1488   }
1489   if (!isa<FixedVectorType>(Src))
1490     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1491                                         CostKind);
  // Scalar cost, which is currently very high due to the inefficiency of the
  // generated code.
1494   return cast<FixedVectorType>(Src)->getNumElements() * 8;
1495 }
1496 
1497 InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1498     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1499     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1500     bool UseMaskForCond, bool UseMaskForGaps) {
1501   assert(Factor >= 2 && "Invalid interleave factor");
1502   assert(isa<VectorType>(VecTy) && "Expect a vector type");
1503 
  // vldN/vstN don't support vector types with i64/f64 elements.
1505   bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1506 
1507   if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1508       !UseMaskForCond && !UseMaskForGaps) {
1509     unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1510     auto *SubVecTy =
1511         FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1512 
1513     // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1514     // Accesses having vector types that are a multiple of 128 bits can be
1515     // matched to more than one vldN/vstN instruction.
1516     int BaseCost =
1517         ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1518     if (NumElts % Factor == 0 &&
1519         TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1520       return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1521 
1522     // Some smaller than legal interleaved patterns are cheap as we can make
1523     // use of the vmovn or vrev patterns to interleave a standard load. This is
1524     // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1525     // promoted differently). The cost of 2 here is then a load and vrev or
1526     // vmovn.
1527     if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1528         VecTy->isIntOrIntVectorTy() &&
1529         DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1530       return 2 * BaseCost;
1531   }
1532 
1533   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1534                                            Alignment, AddressSpace, CostKind,
1535                                            UseMaskForCond, UseMaskForGaps);
1536 }
1537 
1538 InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1539     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1540     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1541   using namespace PatternMatch;
1542   if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1543     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1544                                          Alignment, CostKind, I);
1545 
1546   assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1547   auto *VTy = cast<FixedVectorType>(DataTy);
1548 
1549   // TODO: Splitting, once we do that.
1550 
1551   unsigned NumElems = VTy->getNumElements();
1552   unsigned EltSize = VTy->getScalarSizeInBits();
1553   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
1554 
1555   // For now, it is assumed that for the MVE gather instructions the loads are
1556   // all effectively serialised. This means the cost is the scalar cost
1557   // multiplied by the number of elements being loaded. This is possibly very
1558   // conservative, but even so we still end up vectorising loops because the
1559   // cost per iteration for many loops is lower than for scalar loops.
1560   InstructionCost VectorCost =
1561       NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1562   // The scalarization cost should be a lot higher. We use the number of vector
1563   // elements plus the scalarization overhead.
1564   InstructionCost ScalarCost =
1565       NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
1566       BaseT::getScalarizationOverhead(VTy, false, true);
1567 
1568   if (EltSize < 8 || Alignment < EltSize / 8)
1569     return ScalarCost;
1570 
1571   unsigned ExtSize = EltSize;
1572   // Check whether there's a single user that asks for an extended type
1573   if (I != nullptr) {
    // Depending on the caller of this function, a gather instruction will
    // either have opcode Instruction::Load or be a call to the masked_gather
    // intrinsic.
1577     if ((I->getOpcode() == Instruction::Load ||
1578          match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1579         I->hasOneUse()) {
1580       const User *Us = *I->users().begin();
1581       if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
        // Only allow valid type combinations.
1583         unsigned TypeSize =
1584             cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1585         if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1586              (TypeSize == 16 && EltSize == 8)) &&
1587             TypeSize * NumElems == 128) {
1588           ExtSize = TypeSize;
1589         }
1590       }
1591     }
1592     // Check whether the input data needs to be truncated
1593     TruncInst *T;
1594     if ((I->getOpcode() == Instruction::Store ||
1595          match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1596         (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1597       // Only allow valid type combinations
1598       unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1599       if (((EltSize == 16 && TypeSize == 32) ||
1600            (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1601           TypeSize * NumElems == 128)
1602         ExtSize = TypeSize;
1603     }
1604   }
1605 
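  // MVE gathers and scatters work on a full 128-bit vector of at least 4
  // elements; anything else is scalarised.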
1606   if (ExtSize * NumElems != 128 || NumElems < 4)
1607     return ScalarCost;
1608 
1609   // Any (aligned) i32 gather will not need to be scalarised.
1610   if (ExtSize == 32)
1611     return VectorCost;
1612   // For smaller types, we need to ensure that the gep's inputs are correctly
1613   // extended from a small enough value. Other sizes (including i64) are
1614   // scalarized for now.
1615   if (ExtSize != 8 && ExtSize != 16)
1616     return ScalarCost;
1617 
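  // Check the addressing: a simple GEP whose offset is zero-extended from a
  // small enough type can be handled without scalarising.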
1618   if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1619     Ptr = BC->getOperand(0);
1620   if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1621     if (GEP->getNumOperands() != 2)
1622       return ScalarCost;
1623     unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1624     // Scale needs to be correct (which is only relevant for i16s).
1625     if (Scale != 1 && Scale * 8 != ExtSize)
1626       return ScalarCost;
1627     // And we need to zext (not sext) the indexes from a small enough type.
1628     if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1629       if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1630         return VectorCost;
1631     }
1632     return ScalarCost;
1633   }
1634   return ScalarCost;
1635 }
1636 
1637 InstructionCost
1638 ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1639                                        Optional<FastMathFlags> FMF,
1640                                        TTI::TargetCostKind CostKind) {
1641   if (TTI::requiresOrderedReduction(FMF))
1642     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1643 
1644   EVT ValVT = TLI->getValueType(DL, ValTy);
1645   int ISD = TLI->InstructionOpcodeToISD(Opcode);
1646   if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1647     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1648 
1649   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1650 
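  // A legal vector add reduction maps to a single VADDV, scaled by the MVE
  // beat factor.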
1651   static const CostTblEntry CostTblAdd[]{
1652       {ISD::ADD, MVT::v16i8, 1},
1653       {ISD::ADD, MVT::v8i16, 1},
1654       {ISD::ADD, MVT::v4i32, 1},
1655   };
1656   if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1657     return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1658 
1659   return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1660 }
1661 
1662 InstructionCost
1663 ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1664                                         Type *ResTy, VectorType *ValTy,
1665                                         TTI::TargetCostKind CostKind) {
1666   EVT ValVT = TLI->getValueType(DL, ValTy);
1667   EVT ResVT = TLI->getValueType(DL, ResTy);
1668 
1669   if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1670     std::pair<InstructionCost, MVT> LT =
1671         TLI->getTypeLegalizationCost(DL, ValTy);
1672 
1673     // The legal cases are:
1674     //   VADDV u/s 8/16/32
1675     //   VMLAV u/s 8/16/32
1676     //   VADDLV u/s 32
1677     //   VMLALV u/s 16/32
    // Codegen currently cannot always handle larger-than-legal vectors very
    // well, especially for predicated reductions where the mask needs to be
    // split, so restrict to 128-bit or smaller input types.
1681     unsigned RevVTSize = ResVT.getSizeInBits();
1682     if (ValVT.getSizeInBits() <= 128 &&
1683         ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1684          (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) ||
1685          (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1686       return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1687   }
1688 
1689   return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1690                                             CostKind);
1691 }
1692 
1693 InstructionCost
1694 ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1695                                   TTI::TargetCostKind CostKind) {
1696   switch (ICA.getID()) {
1697   case Intrinsic::get_active_lane_mask:
    // Currently we make a somewhat optimistic assumption that
    // active_lane_masks are always free. In reality one may be freely folded
    // into a tail-predicated loop, expanded into a VCTP or expanded into a lot
    // of add/icmp code. We may need to improve this in the future, but being
    // able to detect whether it is free or not involves looking at a lot of
    // other code. We currently assume that the vectorizer inserted these, and
    // knew what it was doing in adding one.
1705     if (ST->hasMVEIntegerOps())
1706       return 0;
1707     break;
1708   case Intrinsic::sadd_sat:
1709   case Intrinsic::ssub_sat:
1710   case Intrinsic::uadd_sat:
1711   case Intrinsic::usub_sat: {
1712     if (!ST->hasMVEIntegerOps())
1713       break;
1714     Type *VT = ICA.getReturnType();
1715 
1716     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1717     if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1718         LT.second == MVT::v16i8) {
      // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
      // need to extend the type, as it uses shr(qadd(shl, shl)).
1721       unsigned Instrs =
1722           LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1723       return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1724     }
1725     break;
1726   }
1727   case Intrinsic::abs:
1728   case Intrinsic::smin:
1729   case Intrinsic::smax:
1730   case Intrinsic::umin:
1731   case Intrinsic::umax: {
1732     if (!ST->hasMVEIntegerOps())
1733       break;
1734     Type *VT = ICA.getReturnType();
1735 
1736     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1737     if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1738         LT.second == MVT::v16i8)
1739       return LT.first * ST->getMVEVectorCostFactor(CostKind);
1740     break;
1741   }
1742   case Intrinsic::minnum:
1743   case Intrinsic::maxnum: {
1744     if (!ST->hasMVEFloatOps())
1745       break;
1746     Type *VT = ICA.getReturnType();
1747     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1748     if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1749       return LT.first * ST->getMVEVectorCostFactor(CostKind);
1750     break;
1751   }
1752   }
1753 
1754   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1755 }
1756 
1757 bool ARMTTIImpl::isLoweredToCall(const Function *F) {
  if (!F->isIntrinsic())
    return BaseT::isLoweredToCall(F);
1760 
1761   // Assume all Arm-specific intrinsics map to an instruction.
1762   if (F->getName().startswith("llvm.arm"))
1763     return false;
1764 
1765   switch (F->getIntrinsicID()) {
1766   default: break;
1767   case Intrinsic::powi:
1768   case Intrinsic::sin:
1769   case Intrinsic::cos:
1770   case Intrinsic::pow:
1771   case Intrinsic::log:
1772   case Intrinsic::log10:
1773   case Intrinsic::log2:
1774   case Intrinsic::exp:
1775   case Intrinsic::exp2:
1776     return true;
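  // These are expected to map to single instructions when the required FP
  // support (FP64, full FP16, or an FPARMv8/VFP2 base) is available;
  // otherwise they become libcalls.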
1777   case Intrinsic::sqrt:
1778   case Intrinsic::fabs:
1779   case Intrinsic::copysign:
1780   case Intrinsic::floor:
1781   case Intrinsic::ceil:
1782   case Intrinsic::trunc:
1783   case Intrinsic::rint:
1784   case Intrinsic::nearbyint:
1785   case Intrinsic::round:
1786   case Intrinsic::canonicalize:
1787   case Intrinsic::lround:
1788   case Intrinsic::llround:
1789   case Intrinsic::lrint:
1790   case Intrinsic::llrint:
1791     if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1792       return true;
1793     if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1794       return true;
    // Some of these operations can be handled by vector instructions; assume
    // unsupported vectors will be expanded into supported scalar ones.
    // TODO: Handle scalar operations properly.
1798     return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1799   case Intrinsic::masked_store:
1800   case Intrinsic::masked_load:
1801   case Intrinsic::masked_gather:
1802   case Intrinsic::masked_scatter:
1803     return !ST->hasMVEIntegerOps();
1804   case Intrinsic::sadd_with_overflow:
1805   case Intrinsic::uadd_with_overflow:
1806   case Intrinsic::ssub_with_overflow:
1807   case Intrinsic::usub_with_overflow:
1808   case Intrinsic::sadd_sat:
1809   case Intrinsic::uadd_sat:
1810   case Intrinsic::ssub_sat:
1811   case Intrinsic::usub_sat:
1812     return false;
1813   }
1814 
1815   return BaseT::isLoweredToCall(F);
1816 }
1817 
1818 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1819   unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1820   EVT VT = TLI->getValueType(DL, I.getType(), true);
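  // If the target has explicitly marked this operation as a LibCall, it will
  // certainly be lowered to a call.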
1821   if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1822     return true;
1823 
1824   // Check if an intrinsic will be lowered to a call and assume that any
1825   // other CallInst will generate a bl.
1826   if (auto *Call = dyn_cast<CallInst>(&I)) {
1827     if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
      switch (II->getIntrinsicID()) {
      case Intrinsic::memcpy:
      case Intrinsic::memset:
      case Intrinsic::memmove:
        return getNumMemOps(II) == -1;
      default:
        if (const Function *F = Call->getCalledFunction())
          return isLoweredToCall(F);
      }
1837     }
1838     return true;
1839   }
1840 
1841   // FPv5 provides conversions between integer, double-precision,
1842   // single-precision, and half-precision formats.
1843   switch (I.getOpcode()) {
1844   default:
1845     break;
1846   case Instruction::FPToSI:
1847   case Instruction::FPToUI:
1848   case Instruction::SIToFP:
1849   case Instruction::UIToFP:
1850   case Instruction::FPTrunc:
1851   case Instruction::FPExt:
1852     return !ST->hasFPARMv8Base();
1853   }
1854 
1855   // FIXME: Unfortunately the approach of checking the Operation Action does
1856   // not catch all cases of Legalization that use library calls. Our
1857   // Legalization step categorizes some transformations into library calls as
1858   // Custom, Expand or even Legal when doing type legalization. So for now
1859   // we have to special case for instance the SDIV of 64bit integers and the
1860   // use of floating point emulation.
1861   if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1862     switch (ISD) {
1863     default:
1864       break;
1865     case ISD::SDIV:
1866     case ISD::UDIV:
1867     case ISD::SREM:
1868     case ISD::UREM:
1869     case ISD::SDIVREM:
1870     case ISD::UDIVREM:
1871       return true;
1872     }
1873   }
1874 
1875   // Assume all other non-float operations are supported.
1876   if (!VT.isFloatingPoint())
1877     return false;
1878 
1879   // We'll need a library call to handle most floats when using soft.
1880   if (TLI->useSoftFloat()) {
1881     switch (I.getOpcode()) {
1882     default:
1883       return true;
1884     case Instruction::Alloca:
1885     case Instruction::Load:
1886     case Instruction::Store:
1887     case Instruction::Select:
1888     case Instruction::PHI:
1889       return false;
1890     }
1891   }
1892 
1893   // We'll need a libcall to perform double precision operations on a single
1894   // precision only FPU.
1895   if (I.getType()->isDoubleTy() && !ST->hasFP64())
1896     return true;
1897 
1898   // Likewise for half precision arithmetic.
1899   if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1900     return true;
1901 
1902   return false;
1903 }
1904 
1905 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1906                                           AssumptionCache &AC,
1907                                           TargetLibraryInfo *LibInfo,
1908                                           HardwareLoopInfo &HWLoopInfo) {
1909   // Low-overhead branches are only supported in the 'low-overhead branch'
1910   // extension of v8.1-m.
1911   if (!ST->hasLOB() || DisableLowOverheadLoops) {
1912     LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
1913     return false;
1914   }
1915 
1916   if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1917     LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
1918     return false;
1919   }
1920 
1921   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1922   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1923     LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
1924     return false;
1925   }
1926 
1927   const SCEV *TripCountSCEV =
1928     SE.getAddExpr(BackedgeTakenCount,
1929                   SE.getOne(BackedgeTakenCount->getType()));
1930 
1931   // We need to store the trip count in LR, a 32-bit register.
1932   if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1933     LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
1934     return false;
1935   }
1936 
1937   // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1938   // point in generating a hardware loop if that's going to happen.
1939 
1940   auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1941     if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
1942       switch (Call->getIntrinsicID()) {
1943       default:
1944         break;
1945       case Intrinsic::start_loop_iterations:
1946       case Intrinsic::test_start_loop_iterations:
1947       case Intrinsic::loop_decrement:
1948       case Intrinsic::loop_decrement_reg:
1949         return true;
1950       }
1951     }
1952     return false;
1953   };
1954 
1955   // Scan the instructions to see if there's any that we know will turn into a
1956   // call or if this loop is already a low-overhead loop or will become a tail
1957   // predicated loop.
1958   bool IsTailPredLoop = false;
1959   auto ScanLoop = [&](Loop *L) {
1960     for (auto *BB : L->getBlocks()) {
1961       for (auto &I : *BB) {
1962         if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
1963             isa<InlineAsm>(I)) {
1964           LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
1965           return false;
1966         }
1967         if (auto *II = dyn_cast<IntrinsicInst>(&I))
1968           IsTailPredLoop |=
1969               II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
1970               II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
1971               II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
1972               II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
1973               II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
1974       }
1975     }
1976     return true;
1977   };
1978 
1979   // Visit inner loops.
1980   for (auto Inner : *L)
1981     if (!ScanLoop(Inner))
1982       return false;
1983 
1984   if (!ScanLoop(L))
1985     return false;
1986 
1987   // TODO: Check whether the trip count calculation is expensive. If L is the
1988   // inner loop but we know it has a low trip count, calculating that trip
1989   // count (in the parent loop) may be detrimental.
1990 
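  // Set up the hardware loop: a 32-bit counter kept in a register and
  // decremented by one each iteration, with no nested hardware loops, and a
  // while-loop style entry test unless the loop is expected to become
  // tail-predicated.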
1991   LLVMContext &C = L->getHeader()->getContext();
1992   HWLoopInfo.CounterInReg = true;
1993   HWLoopInfo.IsNestingLegal = false;
1994   HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
1995   HWLoopInfo.CountType = Type::getInt32Ty(C);
1996   HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
1997   return true;
1998 }
1999 
2000 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
  // We don't allow icmps, and because we only look at single-block loops,
  // we simply count the icmps, i.e. there should be only 1 for the backedge.
2003   if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2004     return false;
  // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
  // not currently canonical, but soon will be. Code without them uses icmp, and
  // so is not tail-predicated as per the condition above. In order to get the
  // same performance we treat min and max the same as an icmp for tailpred
  // purposes for the moment (we often rely on non-tailpred and higher VFs to
  // pick more optimal instructions like VQDMULH. They need to be recognized
  // directly by the vectorizer).
2012   if (auto *II = dyn_cast<IntrinsicInst>(&I))
2013     if ((II->getIntrinsicID() == Intrinsic::smin ||
2014          II->getIntrinsicID() == Intrinsic::smax ||
2015          II->getIntrinsicID() == Intrinsic::umin ||
2016          II->getIntrinsicID() == Intrinsic::umax) &&
2017         ++ICmpCount > 1)
2018       return false;
2019 
2020   if (isa<FCmpInst>(&I))
2021     return false;
2022 
2023   // We could allow extending/narrowing FP loads/stores, but codegen is
2024   // too inefficient so reject this for now.
2025   if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2026     return false;
2027 
2028   // Extends have to be extending-loads
  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I))
2030     if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2031       return false;
2032 
2033   // Truncs have to be narrowing-stores
  if (isa<TruncInst>(&I))
2035     if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2036       return false;
2037 
2038   return true;
2039 }
2040 
2041 // To set up a tail-predicated loop, we need to know the total number of
2042 // elements processed by that loop. Thus, we need to determine the element
2043 // size and:
2044 // 1) it should be uniform for all operations in the vector loop, so we
2045 //    e.g. don't want any widening/narrowing operations.
2046 // 2) it should be smaller than i64s because we don't have vector operations
2047 //    that work on i64s.
2048 // 3) we don't want elements to be reversed or shuffled, to make sure the
2049 //    tail-predication masks/predicates the right lanes.
2050 //
2051 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2052                                  const DataLayout &DL,
2053                                  const LoopAccessInfo *LAI) {
2054   LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2055 
2056   // If there are live-out values, it is probably a reduction. We can predicate
2057   // most reduction operations freely under MVE using a combination of
2058   // prefer-predicated-reduction-select and inloop reductions. We limit this to
2059   // floating point and integer reductions, but don't check for operators
2060   // specifically here. If the value ends up not being a reduction (and so the
2061   // vectorizer cannot tailfold the loop), we should fall back to standard
2062   // vectorization automatically.
  SmallVector<Instruction *, 8> LiveOuts;
2064   LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2065   bool ReductionsDisabled =
2066       EnableTailPredication == TailPredication::EnabledNoReductions ||
2067       EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2068 
2069   for (auto *I : LiveOuts) {
2070     if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2071         !I->getType()->isHalfTy()) {
2072       LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2073                            "live-out value\n");
2074       return false;
2075     }
2076     if (ReductionsDisabled) {
2077       LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2078       return false;
2079     }
2080   }
2081 
2082   // Next, check that all instructions can be tail-predicated.
2083   PredicatedScalarEvolution PSE = LAI->getPSE();
2084   SmallVector<Instruction *, 16> LoadStores;
2085   int ICmpCount = 0;
2086 
2087   for (BasicBlock *BB : L->blocks()) {
2088     for (Instruction &I : BB->instructionsWithoutDebug()) {
2089       if (isa<PHINode>(&I))
2090         continue;
2091       if (!canTailPredicateInstruction(I, ICmpCount)) {
2092         LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2093         return false;
2094       }
2095 
      Type *T = I.getType();
2097       if (T->isPointerTy())
2098         T = T->getPointerElementType();
2099 
2100       if (T->getScalarSizeInBits() > 32) {
2101         LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2102         return false;
2103       }
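      // Loads and stores need a pointer stride that tail-predication can
      // handle: consecutive (stride 1) accesses, or loop-invariant strides
      // when masked gathers/scatters are enabled.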
2104       if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2105         Value *Ptr = getLoadStorePointerOperand(&I);
2106         Type *AccessTy = getLoadStoreType(&I);
2107         int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L);
2108         if (NextStride == 1) {
          // TODO: for now only allow consecutive strides of 1. We could support
          // other strides as long as they are uniform, but let's keep it simple
          // for now.
2112           continue;
2113         } else if (NextStride == -1 ||
2114                    (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2115                    (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
          LLVM_DEBUG(dbgs()
                     << "Reversed or interleaving stride found, vld2/vld4 "
                        "accesses can't be tail-predicated.\n");
2119           return false;
2120           // TODO: don't tail predicate if there is a reversed load?
2121         } else if (EnableMaskedGatherScatters) {
2122           // Gather/scatters do allow loading from arbitrary strides, at
2123           // least if they are loop invariant.
2124           // TODO: Loop variant strides should in theory work, too, but
2125           // this requires further testing.
2126           const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2127           if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2128             const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2129             if (PSE.getSE()->isLoopInvariant(Step, L))
2130               continue;
2131           }
2132         }
        LLVM_DEBUG(dbgs() << "Bad stride found, can't "
                             "tail-predicate.\n");
2135         return false;
2136       }
2137     }
2138   }
2139 
2140   LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2141   return true;
2142 }
2143 
2144 bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
2145                                              ScalarEvolution &SE,
2146                                              AssumptionCache &AC,
2147                                              TargetLibraryInfo *TLI,
2148                                              DominatorTree *DT,
2149                                              const LoopAccessInfo *LAI) {
2150   if (!EnableTailPredication) {
2151     LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2152     return false;
2153   }
2154 
  // Creating a predicated vector loop is the first step for generating a
  // tail-predicated hardware loop, for which we need the MVE masked
  // load/store instructions.
2158   if (!ST->hasMVEIntegerOps())
2159     return false;
2160 
2161   // For now, restrict this to single block loops.
2162   if (L->getNumBlocks() > 1) {
2163     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2164                          "loop.\n");
2165     return false;
2166   }
2167 
2168   assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2169 
2170   HardwareLoopInfo HWLoopInfo(L);
2171   if (!HWLoopInfo.canAnalyze(*LI)) {
2172     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2173                          "analyzable.\n");
2174     return false;
2175   }
2176 
2177   // This checks if we have the low-overhead branch architecture
2178   // extension, and if we will create a hardware-loop:
2179   if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2180     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2181                          "profitable.\n");
2182     return false;
2183   }
2184 
2185   if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2186     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2187                          "a candidate.\n");
2188     return false;
2189   }
2190 
2191   return canTailPredicateLoop(L, LI, SE, DL, LAI);
2192 }
2193 
2194 bool ARMTTIImpl::emitGetActiveLaneMask() const {
2195   if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2196     return false;
2197 
2198   // Intrinsic @llvm.get.active.lane.mask is supported.
2199   // It is used in the MVETailPredication pass, which requires the number of
2200   // elements processed by this vector loop to setup the tail-predicated
2201   // loop.
2202   return true;
2203 }

void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2205                                          TTI::UnrollingPreferences &UP,
2206                                          OptimizationRemarkEmitter *ORE) {
  // Enable upper-bound unrolling universally, not dependent upon the
  // conditions below.
2209   UP.UpperBound = true;
2210 
2211   // Only currently enable these preferences for M-Class cores.
2212   if (!ST->isMClass())
2213     return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2214 
2215   // Disable loop unrolling for Oz and Os.
2216   UP.OptSizeThreshold = 0;
2217   UP.PartialOptSizeThreshold = 0;
2218   if (L->getHeader()->getParent()->hasOptSize())
2219     return;
2220 
2221   SmallVector<BasicBlock*, 4> ExitingBlocks;
2222   L->getExitingBlocks(ExitingBlocks);
2223   LLVM_DEBUG(dbgs() << "Loop has:\n"
2224                     << "Blocks: " << L->getNumBlocks() << "\n"
2225                     << "Exit blocks: " << ExitingBlocks.size() << "\n");
2226 
  // Only allow one exit other than the latch. This acts as an early exit as it
  // mirrors the profitability calculation of the runtime unroller.
2229   if (ExitingBlocks.size() > 2)
2230     return;
2231 
2232   // Limit the CFG of the loop body for targets with a branch predictor.
2233   // Allowing 4 blocks permits if-then-else diamonds in the body.
2234   if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2235     return;
2236 
2237   // Don't unroll vectorized loops, including the remainder loop
2238   if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2239     return;
2240 
2241   // Scan the loop: don't unroll loops with calls as this could prevent
2242   // inlining.
2243   InstructionCost Cost = 0;
2244   for (auto *BB : L->getBlocks()) {
2245     for (auto &I : *BB) {
      // Don't unroll vectorised loops. MVE does not benefit from it as much as
      // scalar code does.
2248       if (I.getType()->isVectorTy())
2249         return;
2250 
2251       if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2252         if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2253           if (!isLoweredToCall(F))
2254             continue;
2255         }
2256         return;
2257       }
2258 
2259       SmallVector<const Value*, 4> Operands(I.operand_values());
2260       Cost +=
2261         getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
2262     }
2263   }
2264 
  // On v6m cores, there are very few registers available. We can easily end up
  // spilling and reloading more registers in an unrolled loop. Look at the
  // number of LCSSA phis as a rough measure of how many registers will need to
  // be live out of the loop, reducing the default unroll count if more than 1
  // value is needed. In the long run, all of this should be learnt by a
  // machine.
2271   unsigned UnrollCount = 4;
2272   if (ST->isThumb1Only()) {
2273     unsigned ExitingValues = 0;
2274     SmallVector<BasicBlock *, 4> ExitBlocks;
2275     L->getExitBlocks(ExitBlocks);
2276     for (auto *Exit : ExitBlocks) {
2277       // Count the number of LCSSA phis. Exclude values coming from GEP's as
2278       // only the last is expected to be needed for address operands.
2279       unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2280         return PH.getNumOperands() != 1 ||
2281                !isa<GetElementPtrInst>(PH.getOperand(0));
2282       });
2283       ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2284     }
2285     if (ExitingValues)
2286       UnrollCount /= ExitingValues;
2287     if (UnrollCount <= 1)
2288       return;
2289   }
2290 
2291   LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2292   LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2293 
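  // Enable runtime and partial unrolling with the count computed above, and
  // allow unroll-and-jam for small inner loops.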
2294   UP.Partial = true;
2295   UP.Runtime = true;
2296   UP.UnrollRemainder = true;
2297   UP.DefaultUnrollRuntimeCount = UnrollCount;
2298   UP.UnrollAndJam = true;
2299   UP.UnrollAndJamInnerLoopThreshold = 60;
2300 
  // Force-unrolling small loops can be very useful because of the branch-taken
  // cost of the backedge.
2303   if (Cost < 12)
2304     UP.Force = true;
2305 }
2306 
2307 void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2308                                        TTI::PeelingPreferences &PP) {
2309   BaseT::getPeelingPreferences(L, SE, PP);
2310 }
2311 
2312 bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2313                                        TTI::ReductionFlags Flags) const {
2314   if (!ST->hasMVEIntegerOps())
2315     return false;
2316 
2317   unsigned ScalarBits = Ty->getScalarSizeInBits();
2318   switch (Opcode) {
2319   case Instruction::Add:
2320     return ScalarBits <= 64;
2321   default:
2322     return false;
2323   }
2324 }
2325 
2326 bool ARMTTIImpl::preferPredicatedReductionSelect(
2327     unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2328   if (!ST->hasMVEIntegerOps())
2329     return false;
2330   return true;
2331 }
2332