1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "ARMTargetTransformInfo.h"
10 #include "ARMSubtarget.h"
11 #include "MCTargetDesc/ARMAddressingModes.h"
12 #include "llvm/ADT/APInt.h"
13 #include "llvm/ADT/SmallVector.h"
14 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/CodeGen/CostTable.h"
16 #include "llvm/CodeGen/ISDOpcodes.h"
17 #include "llvm/CodeGen/ValueTypes.h"
18 #include "llvm/IR/BasicBlock.h"
19 #include "llvm/IR/DataLayout.h"
20 #include "llvm/IR/DerivedTypes.h"
21 #include "llvm/IR/Instruction.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/IntrinsicInst.h"
24 #include "llvm/IR/IntrinsicsARM.h"
25 #include "llvm/IR/PatternMatch.h"
26 #include "llvm/IR/Type.h"
27 #include "llvm/MC/SubtargetFeature.h"
28 #include "llvm/Support/Casting.h"
29 #include "llvm/Support/MachineValueType.h"
30 #include "llvm/Target/TargetMachine.h"
31 #include <algorithm>
32 #include <cassert>
33 #include <cstdint>
34 #include <utility>
35 
36 using namespace llvm;
37 
38 #define DEBUG_TYPE "armtti"
39 
40 static cl::opt<bool> EnableMaskedLoadStores(
41   "enable-arm-maskedldst", cl::Hidden, cl::init(true),
42   cl::desc("Enable the generation of masked loads and stores"));
43 
44 static cl::opt<bool> DisableLowOverheadLoops(
45   "disable-arm-loloops", cl::Hidden, cl::init(false),
46   cl::desc("Disable the generation of low-overhead loops"));
47 
48 extern cl::opt<bool> DisableTailPredication;
49 
50 extern cl::opt<bool> EnableMaskedGatherScatters;
51 
52 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
53                                      const Function *Callee) const {
54   const TargetMachine &TM = getTLI()->getTargetMachine();
55   const FeatureBitset &CallerBits =
56       TM.getSubtargetImpl(*Caller)->getFeatureBits();
57   const FeatureBitset &CalleeBits =
58       TM.getSubtargetImpl(*Callee)->getFeatureBits();
59 
60   // To inline a callee, all features not in the whitelist must match exactly.
61   bool MatchExact = (CallerBits & ~InlineFeatureWhitelist) ==
62                     (CalleeBits & ~InlineFeatureWhitelist);
63   // For features in the whitelist, the callee's features must be a subset of
64   // the callers'.
65   bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeatureWhitelist) ==
66                      (CalleeBits & InlineFeatureWhitelist);
67   return MatchExact && MatchSubset;
68 }
69 
70 bool ARMTTIImpl::shouldFavorBackedgeIndex(const Loop *L) const {
71   if (L->getHeader()->getParent()->hasOptSize())
72     return false;
73   if (ST->hasMVEIntegerOps())
74     return false;
75   return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1;
76 }
77 
78 bool ARMTTIImpl::shouldFavorPostInc() const {
79   if (ST->hasMVEIntegerOps())
80     return true;
81   return false;
82 }
83 
84 int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
85                               TTI::TargetCostKind CostKind) {
86   assert(Ty->isIntegerTy());
87 
88  unsigned Bits = Ty->getPrimitiveSizeInBits();
89  if (Bits == 0 || Imm.getActiveBits() >= 64)
90    return 4;
91 
92   int64_t SImmVal = Imm.getSExtValue();
93   uint64_t ZImmVal = Imm.getZExtValue();
94   if (!ST->isThumb()) {
95     if ((SImmVal >= 0 && SImmVal < 65536) ||
96         (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
97         (ARM_AM::getSOImmVal(~ZImmVal) != -1))
98       return 1;
99     return ST->hasV6T2Ops() ? 2 : 3;
100   }
101   if (ST->isThumb2()) {
102     if ((SImmVal >= 0 && SImmVal < 65536) ||
103         (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
104         (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
105       return 1;
106     return ST->hasV6T2Ops() ? 2 : 3;
107   }
108   // Thumb1, any i8 imm cost 1.
109   if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
110     return 1;
111   if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
112     return 2;
113   // Load from constantpool.
114   return 3;
115 }
116 
117 // Constants smaller than 256 fit in the immediate field of
118 // Thumb1 instructions so we return a zero cost and 1 otherwise.
119 int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
120                                       const APInt &Imm, Type *Ty) {
121   if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
122     return 0;
123 
124   return 1;
125 }
126 
127 int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
128                                   Type *Ty, TTI::TargetCostKind CostKind) {
129   // Division by a constant can be turned into multiplication, but only if we
130   // know it's constant. So it's not so much that the immediate is cheap (it's
131   // not), but that the alternative is worse.
132   // FIXME: this is probably unneeded with GlobalISel.
133   if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
134        Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
135       Idx == 1)
136     return 0;
137 
138   if (Opcode == Instruction::And) {
139     // UXTB/UXTH
140     if (Imm == 255 || Imm == 65535)
141       return 0;
142     // Conversion to BIC is free, and means we can use ~Imm instead.
143     return std::min(getIntImmCost(Imm, Ty, CostKind),
144                     getIntImmCost(~Imm, Ty, CostKind));
145   }
146 
147   if (Opcode == Instruction::Add)
148     // Conversion to SUB is free, and means we can use -Imm instead.
149     return std::min(getIntImmCost(Imm, Ty, CostKind),
150                     getIntImmCost(-Imm, Ty, CostKind));
151 
152   if (Opcode == Instruction::ICmp && Imm.isNegative() &&
153       Ty->getIntegerBitWidth() == 32) {
154     int64_t NegImm = -Imm.getSExtValue();
155     if (ST->isThumb2() && NegImm < 1<<12)
156       // icmp X, #-C -> cmn X, #C
157       return 0;
158     if (ST->isThumb() && NegImm < 1<<8)
159       // icmp X, #-C -> adds X, #C
160       return 0;
161   }
162 
163   // xor a, -1 can always be folded to MVN
164   if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
165     return 0;
166 
167   return getIntImmCost(Imm, Ty, CostKind);
168 }
169 
170 int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
171                                  TTI::TargetCostKind CostKind,
172                                  const Instruction *I) {
173   int ISD = TLI->InstructionOpcodeToISD(Opcode);
174   assert(ISD && "Invalid opcode");
175 
176   // Single to/from double precision conversions.
177   static const CostTblEntry NEONFltDblTbl[] = {
178     // Vector fptrunc/fpext conversions.
179     { ISD::FP_ROUND,   MVT::v2f64, 2 },
180     { ISD::FP_EXTEND,  MVT::v2f32, 2 },
181     { ISD::FP_EXTEND,  MVT::v4f32, 4 }
182   };
183 
184   if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
185                                           ISD == ISD::FP_EXTEND)) {
186     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
187     if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
188       return LT.first * Entry->Cost;
189   }
190 
191   EVT SrcTy = TLI->getValueType(DL, Src);
192   EVT DstTy = TLI->getValueType(DL, Dst);
193 
194   if (!SrcTy.isSimple() || !DstTy.isSimple())
195     return BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
196 
197   // The extend of a load is free
198   if (I && isa<LoadInst>(I->getOperand(0))) {
199     static const TypeConversionCostTblEntry LoadConversionTbl[] = {
200         {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
201         {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
202         {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
203         {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
204         {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
205         {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
206         {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
207         {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
208         {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
209         {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
210         {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
211         {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
212     };
213     if (const auto *Entry = ConvertCostTableLookup(
214             LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
215       return Entry->Cost;
216 
217     static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
218         {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
219         {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
220         {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
221         {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
222         {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
223         {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
224     };
225     if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
226       if (const auto *Entry =
227               ConvertCostTableLookup(MVELoadConversionTbl, ISD,
228                                      DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
229         return Entry->Cost;
230     }
231   }
232 
233   // NEON vector operations that can extend their inputs.
234   if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
235       I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
236     static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
237       // vaddl
238       { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
239       { ISD::ADD, MVT::v8i16, MVT::v8i8,  0 },
240       // vsubl
241       { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
242       { ISD::SUB, MVT::v8i16, MVT::v8i8,  0 },
243       // vmull
244       { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
245       { ISD::MUL, MVT::v8i16, MVT::v8i8,  0 },
246       // vshll
247       { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
248       { ISD::SHL, MVT::v8i16, MVT::v8i8,  0 },
249     };
250 
251     auto *User = cast<Instruction>(*I->user_begin());
252     int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
253     if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
254                                              DstTy.getSimpleVT(),
255                                              SrcTy.getSimpleVT())) {
256       return Entry->Cost;
257     }
258   }
259 
260   // Some arithmetic, load and store operations have specific instructions
261   // to cast up/down their types automatically at no extra cost.
262   // TODO: Get these tables to know at least what the related operations are.
263   static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
264     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
265     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
266     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
267     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
268     { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
269     { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },
270 
271     // The number of vmovl instructions for the extension.
272     { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
273     { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
274     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
275     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
276     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
277     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
278     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
279     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
280     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
281     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
282     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
283     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
284     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
285     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
286     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
287     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
288     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
289     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
290 
291     // Operations that we legalize using splitting.
292     { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
293     { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },
294 
295     // Vector float <-> i32 conversions.
296     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
297     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
298 
299     { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
300     { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
301     { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
302     { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
303     { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
304     { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
305     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
306     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
307     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
308     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
309     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
310     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
311     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
312     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
313     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
314     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
315     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
316     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
317     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
318     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
319 
320     { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
321     { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
322     { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
323     { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
324     { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
325     { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },
326 
327     // Vector double <-> i32 conversions.
328     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
329     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
330 
331     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
332     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
333     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
334     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
335     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
336     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
337 
338     { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
339     { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
340     { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
341     { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
342     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
343     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
344   };
345 
346   if (SrcTy.isVector() && ST->hasNEON()) {
347     if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
348                                                    DstTy.getSimpleVT(),
349                                                    SrcTy.getSimpleVT()))
350       return Entry->Cost;
351   }
352 
353   // Scalar float to integer conversions.
354   static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
355     { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
356     { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
357     { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
358     { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
359     { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
360     { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
361     { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
362     { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
363     { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
364     { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
365     { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
366     { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
367     { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
368     { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
369     { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
370     { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
371     { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
372     { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
373     { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
374     { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
375   };
376   if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
377     if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
378                                                    DstTy.getSimpleVT(),
379                                                    SrcTy.getSimpleVT()))
380       return Entry->Cost;
381   }
382 
383   // Scalar integer to float conversions.
384   static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
385     { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
386     { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
387     { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
388     { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
389     { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
390     { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
391     { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
392     { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
393     { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
394     { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
395     { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
396     { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
397     { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
398     { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
399     { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
400     { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
401     { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
402     { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
403     { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
404     { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
405   };
406 
407   if (SrcTy.isInteger() && ST->hasNEON()) {
408     if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
409                                                    ISD, DstTy.getSimpleVT(),
410                                                    SrcTy.getSimpleVT()))
411       return Entry->Cost;
412   }
413 
414   // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
415   // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext
416   // are linearised so take more.
417   static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
418     { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
419     { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
420     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
421     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
422     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
423     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
424     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
425     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
426     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
427     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
428     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
429     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
430   };
431 
432   if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
433     if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
434                                                    ISD, DstTy.getSimpleVT(),
435                                                    SrcTy.getSimpleVT()))
436       return Entry->Cost * ST->getMVEVectorCostFactor();
437   }
438 
439   // Scalar integer conversion costs.
440   static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
441     // i16 -> i64 requires two dependent operations.
442     { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
443 
444     // Truncates on i64 are assumed to be free.
445     { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
446     { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
447     { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
448     { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
449   };
450 
451   if (SrcTy.isInteger()) {
452     if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
453                                                    DstTy.getSimpleVT(),
454                                                    SrcTy.getSimpleVT()))
455       return Entry->Cost;
456   }
457 
458   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
459                      ? ST->getMVEVectorCostFactor()
460                      : 1;
461   return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
462 }
463 
464 int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
465                                    unsigned Index) {
466   // Penalize inserting into an D-subregister. We end up with a three times
467   // lower estimated throughput on swift.
468   if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
469       ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
470     return 3;
471 
472   if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
473                         Opcode == Instruction::ExtractElement)) {
474     // Cross-class copies are expensive on many microarchitectures,
475     // so assume they are expensive by default.
476     if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
477       return 3;
478 
479     // Even if it's not a cross class copy, this likely leads to mixing
480     // of NEON and VFP code and should be therefore penalized.
481     if (ValTy->isVectorTy() &&
482         ValTy->getScalarSizeInBits() <= 32)
483       return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
484   }
485 
486   if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
487                                  Opcode == Instruction::ExtractElement)) {
488     // We say MVE moves costs at least the MVEVectorCostFactor, even though
489     // they are scalar instructions. This helps prevent mixing scalar and
490     // vector, to prevent vectorising where we end up just scalarising the
491     // result anyway.
492     return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
493                     ST->getMVEVectorCostFactor()) *
494            cast<VectorType>(ValTy)->getNumElements() / 2;
495   }
496 
497   return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
498 }
499 
500 int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
501                                    TTI::TargetCostKind CostKind,
502                                    const Instruction *I) {
503   int ISD = TLI->InstructionOpcodeToISD(Opcode);
504   // On NEON a vector select gets lowered to vbsl.
505   if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
506     // Lowering of some vector selects is currently far from perfect.
507     static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
508       { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
509       { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
510       { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
511     };
512 
513     EVT SelCondTy = TLI->getValueType(DL, CondTy);
514     EVT SelValTy = TLI->getValueType(DL, ValTy);
515     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
516       if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
517                                                      SelCondTy.getSimpleVT(),
518                                                      SelValTy.getSimpleVT()))
519         return Entry->Cost;
520     }
521 
522     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
523     return LT.first;
524   }
525 
526   int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
527                      ? ST->getMVEVectorCostFactor()
528                      : 1;
529   return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind,
530                                               I);
531 }
532 
533 int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
534                                           const SCEV *Ptr) {
535   // Address computations in vectorized code with non-consecutive addresses will
536   // likely result in more instructions compared to scalar code where the
537   // computation can more often be merged into the index mode. The resulting
538   // extra micro-ops can significantly decrease throughput.
539   unsigned NumVectorInstToHideOverhead = 10;
540   int MaxMergeDistance = 64;
541 
542   if (ST->hasNEON()) {
543     if (Ty->isVectorTy() && SE &&
544         !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
545       return NumVectorInstToHideOverhead;
546 
547     // In many cases the address computation is not merged into the instruction
548     // addressing mode.
549     return 1;
550   }
551   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
552 }
553 
554 bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
555   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
556     // If a VCTP is part of a chain, it's already profitable and shouldn't be
557     // optimized, else LSR may block tail-predication.
558     switch (II->getIntrinsicID()) {
559     case Intrinsic::arm_mve_vctp8:
560     case Intrinsic::arm_mve_vctp16:
561     case Intrinsic::arm_mve_vctp32:
562     case Intrinsic::arm_mve_vctp64:
563       return true;
564     default:
565       break;
566     }
567   }
568   return false;
569 }
570 
571 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
572   if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
573     return false;
574 
575   if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
576     // Don't support v2i1 yet.
577     if (VecTy->getNumElements() == 2)
578       return false;
579 
580     // We don't support extending fp types.
581      unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
582     if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
583       return false;
584   }
585 
586   unsigned EltWidth = DataTy->getScalarSizeInBits();
587   return (EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
588          (EltWidth == 16 && (!Alignment || Alignment >= 2)) ||
589          (EltWidth == 8);
590 }
591 
592 bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) {
593   if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
594     return false;
595 
596   // This method is called in 2 places:
597   //  - from the vectorizer with a scalar type, in which case we need to get
598   //  this as good as we can with the limited info we have (and rely on the cost
599   //  model for the rest).
600   //  - from the masked intrinsic lowering pass with the actual vector type.
601   // For MVE, we have a custom lowering pass that will already have custom
602   // legalised any gathers that we can to MVE intrinsics, and want to expand all
603   // the rest. The pass runs before the masked intrinsic lowering pass, so if we
604   // are here, we know we want to expand.
605   if (isa<VectorType>(Ty))
606     return false;
607 
608   unsigned EltWidth = Ty->getScalarSizeInBits();
609   return ((EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
610           (EltWidth == 16 && (!Alignment || Alignment >= 2)) || EltWidth == 8);
611 }
612 
613 int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
614   const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
615   assert(MI && "MemcpyInst expected");
616   ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength());
617 
618   // To model the cost of a library call, we assume 1 for the call, and
619   // 3 for the argument setup.
620   const unsigned LibCallCost = 4;
621 
622   // If 'size' is not a constant, a library call will be generated.
623   if (!C)
624     return LibCallCost;
625 
626   const unsigned Size = C->getValue().getZExtValue();
627   const Align DstAlign = *MI->getDestAlign();
628   const Align SrcAlign = *MI->getSourceAlign();
629   const Function *F = I->getParent()->getParent();
630   const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
631   std::vector<EVT> MemOps;
632 
633   // MemOps will be poplulated with a list of data types that needs to be
634   // loaded and stored. That's why we multiply the number of elements by 2 to
635   // get the cost for this memcpy.
636   if (getTLI()->findOptimalMemOpLowering(
637           MemOps, Limit,
638           MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
639                       /*IsVolatile*/ true),
640           MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
641           F->getAttributes()))
642     return MemOps.size() * 2;
643 
644   // If we can't find an optimal memop lowering, return the default cost
645   return LibCallCost;
646 }
647 
648 int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
649                                int Index, VectorType *SubTp) {
650   if (ST->hasNEON()) {
651     if (Kind == TTI::SK_Broadcast) {
652       static const CostTblEntry NEONDupTbl[] = {
653           // VDUP handles these cases.
654           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
655           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
656           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
657           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
658           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
659           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
660 
661           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
662           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
663           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
664           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
665 
666       std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
667 
668       if (const auto *Entry =
669               CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
670         return LT.first * Entry->Cost;
671     }
672     if (Kind == TTI::SK_Reverse) {
673       static const CostTblEntry NEONShuffleTbl[] = {
674           // Reverse shuffle cost one instruction if we are shuffling within a
675           // double word (vrev) or two if we shuffle a quad word (vrev, vext).
676           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
677           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
678           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
679           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
680           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
681           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
682 
683           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
684           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
685           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
686           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
687 
688       std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
689 
690       if (const auto *Entry =
691               CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
692         return LT.first * Entry->Cost;
693     }
694     if (Kind == TTI::SK_Select) {
695       static const CostTblEntry NEONSelShuffleTbl[] = {
696           // Select shuffle cost table for ARM. Cost is the number of
697           // instructions
698           // required to create the shuffled vector.
699 
700           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
701           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
702           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
703           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
704 
705           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
706           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
707           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
708 
709           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
710 
711           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
712 
713       std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
714       if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
715                                               ISD::VECTOR_SHUFFLE, LT.second))
716         return LT.first * Entry->Cost;
717     }
718   }
719   if (ST->hasMVEIntegerOps()) {
720     if (Kind == TTI::SK_Broadcast) {
721       static const CostTblEntry MVEDupTbl[] = {
722           // VDUP handles these cases.
723           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
724           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
725           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
726           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
727           {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
728 
729       std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
730 
731       if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
732                                               LT.second))
733         return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
734     }
735   }
736   int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
737                      ? ST->getMVEVectorCostFactor()
738                      : 1;
739   return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
740 }
741 
742 int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
743                                        TTI::TargetCostKind CostKind,
744                                        TTI::OperandValueKind Op1Info,
745                                        TTI::OperandValueKind Op2Info,
746                                        TTI::OperandValueProperties Opd1PropInfo,
747                                        TTI::OperandValueProperties Opd2PropInfo,
748                                        ArrayRef<const Value *> Args,
749                                        const Instruction *CxtI) {
750   int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
751   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
752 
753   if (ST->hasNEON()) {
754     const unsigned FunctionCallDivCost = 20;
755     const unsigned ReciprocalDivCost = 10;
756     static const CostTblEntry CostTbl[] = {
757       // Division.
758       // These costs are somewhat random. Choose a cost of 20 to indicate that
759       // vectorizing devision (added function call) is going to be very expensive.
760       // Double registers types.
761       { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
762       { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
763       { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
764       { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
765       { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
766       { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
767       { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
768       { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
769       { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
770       { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
771       { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
772       { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
773       { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
774       { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
775       { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
776       { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
777       // Quad register types.
778       { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
779       { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
780       { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
781       { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
782       { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
783       { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
784       { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
785       { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
786       { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
787       { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
788       { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
789       { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
790       { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
791       { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
792       { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
793       { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
794       // Multiplication.
795     };
796 
797     if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
798       return LT.first * Entry->Cost;
799 
800     int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
801                                              Op2Info,
802                                              Opd1PropInfo, Opd2PropInfo);
803 
804     // This is somewhat of a hack. The problem that we are facing is that SROA
805     // creates a sequence of shift, and, or instructions to construct values.
806     // These sequences are recognized by the ISel and have zero-cost. Not so for
807     // the vectorized code. Because we have support for v2i64 but not i64 those
808     // sequences look particularly beneficial to vectorize.
809     // To work around this we increase the cost of v2i64 operations to make them
810     // seem less beneficial.
811     if (LT.second == MVT::v2i64 &&
812         Op2Info == TargetTransformInfo::OK_UniformConstantValue)
813       Cost += 4;
814 
815     return Cost;
816   }
817 
818   // If this operation is a shift on arm/thumb2, it might well be folded into
819   // the following instruction, hence having a cost of 0.
820   auto LooksLikeAFreeShift = [&]() {
821     if (ST->isThumb1Only() || Ty->isVectorTy())
822       return false;
823 
824     if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
825       return false;
826     if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
827       return false;
828 
829     // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
830     switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
831     case Instruction::Add:
832     case Instruction::Sub:
833     case Instruction::And:
834     case Instruction::Xor:
835     case Instruction::Or:
836     case Instruction::ICmp:
837       return true;
838     default:
839       return false;
840     }
841   };
842   if (LooksLikeAFreeShift())
843     return 0;
844 
845   int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
846                      ? ST->getMVEVectorCostFactor()
847                      : 1;
848 
849   // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
850   // without treating floats as more expensive that scalars or increasing the
851   // costs for custom operations. The results is also multiplied by the
852   // MVEVectorCostFactor where appropriate.
853   if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
854     return LT.first * BaseCost;
855 
856   // Else this is expand, assume that we need to scalarize this op.
857   if (auto *VTy = dyn_cast<VectorType>(Ty)) {
858     unsigned Num = VTy->getNumElements();
859     unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType(),
860                                            CostKind);
861     // Return the cost of multiple scalar invocation plus the cost of
862     // inserting and extracting the values.
863     return BaseT::getScalarizationOverhead(VTy, Args) + Num * Cost;
864   }
865 
866   return BaseCost;
867 }
868 
869 int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
870                                 MaybeAlign Alignment, unsigned AddressSpace,
871                                 TTI::TargetCostKind CostKind,
872                                 const Instruction *I) {
873   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
874 
875   if (ST->hasNEON() && Src->isVectorTy() &&
876       (Alignment && *Alignment != Align(16)) &&
877       cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
878     // Unaligned loads/stores are extremely inefficient.
879     // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
880     return LT.first * 4;
881   }
882   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
883                      ? ST->getMVEVectorCostFactor()
884                      : 1;
885   return BaseCost * LT.first;
886 }
887 
888 int ARMTTIImpl::getInterleavedMemoryOpCost(
889     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
890     unsigned Alignment, unsigned AddressSpace,
891     TTI::TargetCostKind CostKind,
892     bool UseMaskForCond, bool UseMaskForGaps) {
893   assert(Factor >= 2 && "Invalid interleave factor");
894   assert(isa<VectorType>(VecTy) && "Expect a vector type");
895 
896   // vldN/vstN doesn't support vector types of i64/f64 element.
897   bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
898 
899   if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
900       !UseMaskForCond && !UseMaskForGaps) {
901     unsigned NumElts = cast<VectorType>(VecTy)->getNumElements();
902     auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
903 
904     // vldN/vstN only support legal vector types of size 64 or 128 in bits.
905     // Accesses having vector types that are a multiple of 128 bits can be
906     // matched to more than one vldN/vstN instruction.
907     int BaseCost = ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor() : 1;
908     if (NumElts % Factor == 0 &&
909         TLI->isLegalInterleavedAccessType(Factor, SubVecTy, DL))
910       return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
911 
912     // Some smaller than legal interleaved patterns are cheap as we can make
913     // use of the vmovn or vrev patterns to interleave a standard load. This is
914     // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
915     // promoted differently). The cost of 2 here is then a load and vrev or
916     // vmovn.
917     if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
918         VecTy->isIntOrIntVectorTy() && DL.getTypeSizeInBits(SubVecTy) <= 64)
919       return 2 * BaseCost;
920   }
921 
922   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
923                                            Alignment, AddressSpace, CostKind,
924                                            UseMaskForCond, UseMaskForGaps);
925 }
926 
927 unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
928                                             Value *Ptr, bool VariableMask,
929                                             unsigned Alignment,
930                                             TTI::TargetCostKind CostKind,
931                                             const Instruction *I) {
932   using namespace PatternMatch;
933   if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
934     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
935                                          Alignment, CostKind, I);
936 
937   assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
938   VectorType *VTy = cast<VectorType>(DataTy);
939 
940   // TODO: Splitting, once we do that.
941 
942   unsigned NumElems = VTy->getNumElements();
943   unsigned EltSize = VTy->getScalarSizeInBits();
944   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
945 
946   // For now, it is assumed that for the MVE gather instructions the loads are
947   // all effectively serialised. This means the cost is the scalar cost
948   // multiplied by the number of elements being loaded. This is possibly very
949   // conservative, but even so we still end up vectorising loops because the
950   // cost per iteration for many loops is lower than for scalar loops.
951   unsigned VectorCost = NumElems * LT.first;
952   // The scalarization cost should be a lot higher. We use the number of vector
953   // elements plus the scalarization overhead.
954   unsigned ScalarCost =
955       NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, {});
956 
957   if (Alignment < EltSize / 8)
958     return ScalarCost;
959 
960   unsigned ExtSize = EltSize;
961   // Check whether there's a single user that asks for an extended type
962   if (I != nullptr) {
963     // Dependent of the caller of this function, a gather instruction will
964     // either have opcode Instruction::Load or be a call to the masked_gather
965     // intrinsic
966     if ((I->getOpcode() == Instruction::Load ||
967          match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
968         I->hasOneUse()) {
969       const User *Us = *I->users().begin();
970       if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
971         // only allow valid type combinations
972         unsigned TypeSize =
973             cast<Instruction>(Us)->getType()->getScalarSizeInBits();
974         if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
975              (TypeSize == 16 && EltSize == 8)) &&
976             TypeSize * NumElems == 128) {
977           ExtSize = TypeSize;
978         }
979       }
980     }
981     // Check whether the input data needs to be truncated
982     TruncInst *T;
983     if ((I->getOpcode() == Instruction::Store ||
984          match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
985         (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
986       // Only allow valid type combinations
987       unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
988       if (((EltSize == 16 && TypeSize == 32) ||
989            (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
990           TypeSize * NumElems == 128)
991         ExtSize = TypeSize;
992     }
993   }
994 
995   if (ExtSize * NumElems != 128 || NumElems < 4)
996     return ScalarCost;
997 
998   // Any (aligned) i32 gather will not need to be scalarised.
999   if (ExtSize == 32)
1000     return VectorCost;
1001   // For smaller types, we need to ensure that the gep's inputs are correctly
1002   // extended from a small enough value. Other sizes (including i64) are
1003   // scalarized for now.
1004   if (ExtSize != 8 && ExtSize != 16)
1005     return ScalarCost;
1006 
1007   if (auto BC = dyn_cast<BitCastInst>(Ptr))
1008     Ptr = BC->getOperand(0);
1009   if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1010     if (GEP->getNumOperands() != 2)
1011       return ScalarCost;
1012     unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1013     // Scale needs to be correct (which is only relevant for i16s).
1014     if (Scale != 1 && Scale * 8 != ExtSize)
1015       return ScalarCost;
1016     // And we need to zext (not sext) the indexes from a small enough type.
1017     if (auto ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1018       if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1019         return VectorCost;
1020     }
1021     return ScalarCost;
1022   }
1023   return ScalarCost;
1024 }
1025 
1026 bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1027   if (!F->isIntrinsic())
1028     BaseT::isLoweredToCall(F);
1029 
1030   // Assume all Arm-specific intrinsics map to an instruction.
1031   if (F->getName().startswith("llvm.arm"))
1032     return false;
1033 
1034   switch (F->getIntrinsicID()) {
1035   default: break;
1036   case Intrinsic::powi:
1037   case Intrinsic::sin:
1038   case Intrinsic::cos:
1039   case Intrinsic::pow:
1040   case Intrinsic::log:
1041   case Intrinsic::log10:
1042   case Intrinsic::log2:
1043   case Intrinsic::exp:
1044   case Intrinsic::exp2:
1045     return true;
1046   case Intrinsic::sqrt:
1047   case Intrinsic::fabs:
1048   case Intrinsic::copysign:
1049   case Intrinsic::floor:
1050   case Intrinsic::ceil:
1051   case Intrinsic::trunc:
1052   case Intrinsic::rint:
1053   case Intrinsic::nearbyint:
1054   case Intrinsic::round:
1055   case Intrinsic::canonicalize:
1056   case Intrinsic::lround:
1057   case Intrinsic::llround:
1058   case Intrinsic::lrint:
1059   case Intrinsic::llrint:
1060     if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1061       return true;
1062     if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1063       return true;
1064     // Some operations can be handled by vector instructions and assume
1065     // unsupported vectors will be expanded into supported scalar ones.
1066     // TODO Handle scalar operations properly.
1067     return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1068   case Intrinsic::masked_store:
1069   case Intrinsic::masked_load:
1070   case Intrinsic::masked_gather:
1071   case Intrinsic::masked_scatter:
1072     return !ST->hasMVEIntegerOps();
1073   case Intrinsic::sadd_with_overflow:
1074   case Intrinsic::uadd_with_overflow:
1075   case Intrinsic::ssub_with_overflow:
1076   case Intrinsic::usub_with_overflow:
1077   case Intrinsic::sadd_sat:
1078   case Intrinsic::uadd_sat:
1079   case Intrinsic::ssub_sat:
1080   case Intrinsic::usub_sat:
1081     return false;
1082   }
1083 
1084   return BaseT::isLoweredToCall(F);
1085 }
1086 
1087 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1088                                           AssumptionCache &AC,
1089                                           TargetLibraryInfo *LibInfo,
1090                                           HardwareLoopInfo &HWLoopInfo) {
1091   // Low-overhead branches are only supported in the 'low-overhead branch'
1092   // extension of v8.1-m.
1093   if (!ST->hasLOB() || DisableLowOverheadLoops) {
1094     LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
1095     return false;
1096   }
1097 
1098   if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1099     LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
1100     return false;
1101   }
1102 
1103   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1104   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1105     LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
1106     return false;
1107   }
1108 
1109   const SCEV *TripCountSCEV =
1110     SE.getAddExpr(BackedgeTakenCount,
1111                   SE.getOne(BackedgeTakenCount->getType()));
1112 
1113   // We need to store the trip count in LR, a 32-bit register.
1114   if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1115     LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
1116     return false;
1117   }
1118 
1119   // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1120   // point in generating a hardware loop if that's going to happen.
1121   auto MaybeCall = [this](Instruction &I) {
1122     const ARMTargetLowering *TLI = getTLI();
1123     unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1124     EVT VT = TLI->getValueType(DL, I.getType(), true);
1125     if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1126       return true;
1127 
1128     // Check if an intrinsic will be lowered to a call and assume that any
1129     // other CallInst will generate a bl.
1130     if (auto *Call = dyn_cast<CallInst>(&I)) {
1131       if (isa<IntrinsicInst>(Call)) {
1132         if (const Function *F = Call->getCalledFunction())
1133           return isLoweredToCall(F);
1134       }
1135       return true;
1136     }
1137 
1138     // FPv5 provides conversions between integer, double-precision,
1139     // single-precision, and half-precision formats.
1140     switch (I.getOpcode()) {
1141     default:
1142       break;
1143     case Instruction::FPToSI:
1144     case Instruction::FPToUI:
1145     case Instruction::SIToFP:
1146     case Instruction::UIToFP:
1147     case Instruction::FPTrunc:
1148     case Instruction::FPExt:
1149       return !ST->hasFPARMv8Base();
1150     }
1151 
1152     // FIXME: Unfortunately the approach of checking the Operation Action does
1153     // not catch all cases of Legalization that use library calls. Our
1154     // Legalization step categorizes some transformations into library calls as
1155     // Custom, Expand or even Legal when doing type legalization. So for now
1156     // we have to special case for instance the SDIV of 64bit integers and the
1157     // use of floating point emulation.
1158     if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1159       switch (ISD) {
1160       default:
1161         break;
1162       case ISD::SDIV:
1163       case ISD::UDIV:
1164       case ISD::SREM:
1165       case ISD::UREM:
1166       case ISD::SDIVREM:
1167       case ISD::UDIVREM:
1168         return true;
1169       }
1170     }
1171 
1172     // Assume all other non-float operations are supported.
1173     if (!VT.isFloatingPoint())
1174       return false;
1175 
1176     // We'll need a library call to handle most floats when using soft.
1177     if (TLI->useSoftFloat()) {
1178       switch (I.getOpcode()) {
1179       default:
1180         return true;
1181       case Instruction::Alloca:
1182       case Instruction::Load:
1183       case Instruction::Store:
1184       case Instruction::Select:
1185       case Instruction::PHI:
1186         return false;
1187       }
1188     }
1189 
1190     // We'll need a libcall to perform double precision operations on a single
1191     // precision only FPU.
1192     if (I.getType()->isDoubleTy() && !ST->hasFP64())
1193       return true;
1194 
1195     // Likewise for half precision arithmetic.
1196     if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1197       return true;
1198 
1199     return false;
1200   };
1201 
1202   auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1203     if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
1204       switch (Call->getIntrinsicID()) {
1205       default:
1206         break;
1207       case Intrinsic::set_loop_iterations:
1208       case Intrinsic::test_set_loop_iterations:
1209       case Intrinsic::loop_decrement:
1210       case Intrinsic::loop_decrement_reg:
1211         return true;
1212       }
1213     }
1214     return false;
1215   };
1216 
1217   // Scan the instructions to see if there's any that we know will turn into a
1218   // call or if this loop is already a low-overhead loop.
1219   auto ScanLoop = [&](Loop *L) {
1220     for (auto *BB : L->getBlocks()) {
1221       for (auto &I : *BB) {
1222         if (MaybeCall(I) || IsHardwareLoopIntrinsic(I)) {
1223           LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
1224           return false;
1225         }
1226       }
1227     }
1228     return true;
1229   };
1230 
1231   // Visit inner loops.
1232   for (auto Inner : *L)
1233     if (!ScanLoop(Inner))
1234       return false;
1235 
1236   if (!ScanLoop(L))
1237     return false;
1238 
1239   // TODO: Check whether the trip count calculation is expensive. If L is the
1240   // inner loop but we know it has a low trip count, calculating that trip
1241   // count (in the parent loop) may be detrimental.
1242 
1243   LLVMContext &C = L->getHeader()->getContext();
1244   HWLoopInfo.CounterInReg = true;
1245   HWLoopInfo.IsNestingLegal = false;
1246   HWLoopInfo.PerformEntryTest = true;
1247   HWLoopInfo.CountType = Type::getInt32Ty(C);
1248   HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
1249   return true;
1250 }
1251 
1252 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
1253   // We don't allow icmp's, and because we only look at single block loops,
1254   // we simply count the icmps, i.e. there should only be 1 for the backedge.
1255   if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
1256     return false;
1257 
1258   if (isa<FCmpInst>(&I))
1259     return false;
1260 
1261   // We could allow extending/narrowing FP loads/stores, but codegen is
1262   // too inefficient so reject this for now.
1263   if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
1264     return false;
1265 
1266   // Extends have to be extending-loads
1267   if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
1268     if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
1269       return false;
1270 
1271   // Truncs have to be narrowing-stores
1272   if (isa<TruncInst>(&I) )
1273     if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
1274       return false;
1275 
1276   return true;
1277 }
1278 
1279 // To set up a tail-predicated loop, we need to know the total number of
1280 // elements processed by that loop. Thus, we need to determine the element
1281 // size and:
1282 // 1) it should be uniform for all operations in the vector loop, so we
1283 //    e.g. don't want any widening/narrowing operations.
1284 // 2) it should be smaller than i64s because we don't have vector operations
1285 //    that work on i64s.
1286 // 3) we don't want elements to be reversed or shuffled, to make sure the
1287 //    tail-predication masks/predicates the right lanes.
1288 //
1289 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
1290                                  const DataLayout &DL,
1291                                  const LoopAccessInfo *LAI) {
1292   PredicatedScalarEvolution PSE = LAI->getPSE();
1293   int ICmpCount = 0;
1294   int Stride = 0;
1295 
1296   LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n");
1297   SmallVector<Instruction *, 16> LoadStores;
1298   for (BasicBlock *BB : L->blocks()) {
1299     for (Instruction &I : BB->instructionsWithoutDebug()) {
1300       if (isa<PHINode>(&I))
1301         continue;
1302       if (!canTailPredicateInstruction(I, ICmpCount)) {
1303         LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
1304         return false;
1305       }
1306 
      Type *T = I.getType();
1308       if (T->isPointerTy())
1309         T = T->getPointerElementType();
1310 
1311       if (T->getScalarSizeInBits() > 32) {
1312         LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
1313         return false;
1314       }
1315 
1316       if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
1317         Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
1318         int64_t NextStride = getPtrStride(PSE, Ptr, L);
1319         // TODO: for now only allow consecutive strides of 1. We could support
1320         // other strides as long as it is uniform, but let's keep it simple for
1321         // now.
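        // E.g. an access like a[i] has stride 1 and is accepted, while a[2*i]
        // or a reversed a[-i] access yields a different stride and is rejected
        // below.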
1322         if (Stride == 0 && NextStride == 1) {
1323           Stride = NextStride;
1324           continue;
1325         }
1326         if (Stride != NextStride) {
          LLVM_DEBUG(dbgs() << "Different strides found, can't "
                               "tail-predicate.\n");
1329           return false;
1330         }
1331       }
1332     }
1333   }
1334 
1335   LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
1336   return true;
1337 }
1338 
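// Tail-folding with predication is preferred over a vector epilogue when all
// of the checks below succeed: tail-predication is enabled, the target has MVE
// integer ops, the loop is a single-block inner loop, it can be turned into a
// profitable hardware loop, and every instruction in it can be tail-predicated.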
1339 bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
1340                                              ScalarEvolution &SE,
1341                                              AssumptionCache &AC,
1342                                              TargetLibraryInfo *TLI,
1343                                              DominatorTree *DT,
1344                                              const LoopAccessInfo *LAI) {
1345   if (DisableTailPredication)
1346     return false;
1347 
  // Creating a predicated vector loop is the first step for generating a
  // tail-predicated hardware loop, for which we need the MVE masked
  // load/store instructions:
1351   if (!ST->hasMVEIntegerOps())
1352     return false;
1353 
1354   // For now, restrict this to single block loops.
1355   if (L->getNumBlocks() > 1) {
1356     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
1357                          "loop.\n");
1358     return false;
1359   }
1360 
1361   assert(L->empty() && "preferPredicateOverEpilogue: inner-loop expected");
1362 
1363   HardwareLoopInfo HWLoopInfo(L);
1364   if (!HWLoopInfo.canAnalyze(*LI)) {
1365     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
1366                          "analyzable.\n");
1367     return false;
1368   }
1369 
1370   // This checks if we have the low-overhead branch architecture
1371   // extension, and if we will create a hardware-loop:
1372   if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
1373     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
1374                          "profitable.\n");
1375     return false;
1376   }
1377 
1378   if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
1379     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
1380                          "a candidate.\n");
1381     return false;
1382   }
1383 
1384   return canTailPredicateLoop(L, LI, SE, DL, LAI);
1385 }
1386 
1388 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1389                                          TTI::UnrollingPreferences &UP) {
  // Currently, only enable these preferences for M-Class cores.
1391   if (!ST->isMClass())
1392     return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);
1393 
1394   // Disable loop unrolling for Oz and Os.
1395   UP.OptSizeThreshold = 0;
1396   UP.PartialOptSizeThreshold = 0;
1397   if (L->getHeader()->getParent()->hasOptSize())
1398     return;
1399 
1400   // Only enable on Thumb-2 targets.
1401   if (!ST->isThumb2())
1402     return;
1403 
1404   SmallVector<BasicBlock*, 4> ExitingBlocks;
1405   L->getExitingBlocks(ExitingBlocks);
1406   LLVM_DEBUG(dbgs() << "Loop has:\n"
1407                     << "Blocks: " << L->getNumBlocks() << "\n"
1408                     << "Exit blocks: " << ExitingBlocks.size() << "\n");
1409 
  // Allow at most one exit other than the latch. This acts as an early exit
  // as it mirrors the profitability calculation of the runtime unroller.
1412   if (ExitingBlocks.size() > 2)
1413     return;
1414 
1415   // Limit the CFG of the loop body for targets with a branch predictor.
1416   // Allowing 4 blocks permits if-then-else diamonds in the body.
1417   if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
1418     return;
1419 
1420   // Scan the loop: don't unroll loops with calls as this could prevent
1421   // inlining.
1422   unsigned Cost = 0;
1423   for (auto *BB : L->getBlocks()) {
1424     for (auto &I : *BB) {
      // Don't unroll vectorised loops. MVE code does not benefit from
      // unrolling as much as scalar code does.
1427       if (I.getType()->isVectorTy())
1428         return;
1429 
1430       if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1431         if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1432           if (!isLoweredToCall(F))
1433             continue;
1434         }
1435         return;
1436       }
1437 
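      // Otherwise, accumulate an approximate code-size cost of the
      // instruction.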
1438       SmallVector<const Value*, 4> Operands(I.value_op_begin(),
1439                                             I.value_op_end());
1440       Cost += getUserCost(&I, Operands, TargetTransformInfo::TCK_CodeSize);
1441     }
1442   }
1443 
1444   LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1445 
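  // The loop looks small enough: enable partial and runtime unrolling with a
  // default runtime count of 4, and allow unroll-and-jam of small inner loops.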
1446   UP.Partial = true;
1447   UP.Runtime = true;
1448   UP.UpperBound = true;
1449   UP.UnrollRemainder = true;
1450   UP.DefaultUnrollRuntimeCount = 4;
1451   UP.UnrollAndJam = true;
1452   UP.UnrollAndJamInnerLoopThreshold = 60;
1453 
  // Force unrolling of small loops, which can be very useful because of the
  // branch-taken cost of the backedge.
1456   if (Cost < 12)
1457     UP.Force = true;
1458 }
1459 
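// Use the vector reduction intrinsics only for integer add and compare
// (min/max) reductions with sub-64-bit elements whose total vector width is a
// multiple of 128 bits; the remaining cases fall back to the default lowering.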
1460 bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
1461                                        TTI::ReductionFlags Flags) const {
1462   assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
1463   unsigned ScalarBits = Ty->getScalarSizeInBits();
1464   if (!ST->hasMVEIntegerOps())
1465     return false;
1466 
1467   switch (Opcode) {
1468   case Instruction::FAdd:
1469   case Instruction::FMul:
1470   case Instruction::And:
1471   case Instruction::Or:
1472   case Instruction::Xor:
1473   case Instruction::Mul:
1474   case Instruction::FCmp:
1475     return false;
1476   case Instruction::ICmp:
1477   case Instruction::Add:
1478     return ScalarBits < 64 &&
1479            (ScalarBits * cast<VectorType>(Ty)->getNumElements()) % 128 == 0;
1480   default:
1481     llvm_unreachable("Unhandled reduction opcode");
1482   }
1483   return false;
1484 }
1485