1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "ARMTargetTransformInfo.h"
10 #include "ARMSubtarget.h"
11 #include "MCTargetDesc/ARMAddressingModes.h"
12 #include "llvm/ADT/APInt.h"
13 #include "llvm/ADT/SmallVector.h"
14 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/CodeGen/CostTable.h"
16 #include "llvm/CodeGen/ISDOpcodes.h"
17 #include "llvm/CodeGen/ValueTypes.h"
18 #include "llvm/IR/BasicBlock.h"
19 #include "llvm/IR/DataLayout.h"
20 #include "llvm/IR/DerivedTypes.h"
21 #include "llvm/IR/Instruction.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/IntrinsicInst.h"
24 #include "llvm/IR/Intrinsics.h"
25 #include "llvm/IR/IntrinsicsARM.h"
26 #include "llvm/IR/PatternMatch.h"
27 #include "llvm/IR/Type.h"
28 #include "llvm/MC/SubtargetFeature.h"
29 #include "llvm/Support/Casting.h"
30 #include "llvm/Support/KnownBits.h"
31 #include "llvm/Support/MachineValueType.h"
32 #include "llvm/Target/TargetMachine.h"
33 #include "llvm/Transforms/InstCombine/InstCombiner.h"
34 #include "llvm/Transforms/Utils/Local.h"
35 #include "llvm/Transforms/Utils/LoopUtils.h"
36 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
37 #include <algorithm>
38 #include <cassert>
39 #include <cstdint>
40 #include <utility>
41
42 using namespace llvm;
43
44 #define DEBUG_TYPE "armtti"
45
46 static cl::opt<bool> EnableMaskedLoadStores(
47 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
48 cl::desc("Enable the generation of masked loads and stores"));
49
50 static cl::opt<bool> DisableLowOverheadLoops(
51 "disable-arm-loloops", cl::Hidden, cl::init(false),
52 cl::desc("Disable the generation of low-overhead loops"));
53
54 static cl::opt<bool>
55 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
56 cl::desc("Enable the generation of WLS loops"));
57
58 extern cl::opt<TailPredication::Mode> EnableTailPredication;
59
60 extern cl::opt<bool> EnableMaskedGatherScatters;
61
62 extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
63
64 /// Convert a vector load intrinsic into a simple llvm load instruction.
65 /// This is beneficial when the underlying object being addressed comes
66 /// from a constant, since we get constant-folding for free.
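/// Illustrative example (pre-opaque-pointer IR syntax, not taken from a
/// specific test):
///   %r = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* %p, i32 4)
/// becomes a plain
///   %r = load <4 x i32>, <4 x i32>* %cast, align 4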
67 static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
68 InstCombiner::BuilderTy &Builder) {
69 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
70
71 if (!IntrAlign)
72 return nullptr;
73
74 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
75 ? MemAlign
76 : IntrAlign->getLimitedValue();
77
78 if (!isPowerOf2_32(Alignment))
79 return nullptr;
80
81 auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
82 PointerType::get(II.getType(), 0));
83 return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
84 }
85
86 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
87 const Function *Callee) const {
88 const TargetMachine &TM = getTLI()->getTargetMachine();
89 const FeatureBitset &CallerBits =
90 TM.getSubtargetImpl(*Caller)->getFeatureBits();
91 const FeatureBitset &CalleeBits =
92 TM.getSubtargetImpl(*Callee)->getFeatureBits();
93
94 // To inline a callee, all features not in the allowed list must match exactly.
95 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
96 (CalleeBits & ~InlineFeaturesAllowed);
97 // For features in the allowed list, the callee's features must be a subset of
98 // the caller's.
99 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
100 (CalleeBits & InlineFeaturesAllowed);
101 return MatchExact && MatchSubset;
102 }
103
104 TTI::AddressingModeKind
105 ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
106 ScalarEvolution *SE) const {
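// Heuristic: MVE code tends to sit in low-overhead/tail-predicated loops where
// post-incrementing loads and stores fold cleanly into the memory
// instructions, so prefer post-indexed addressing there.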
107 if (ST->hasMVEIntegerOps())
108 return TTI::AMK_PostIndexed;
109
110 if (L->getHeader()->getParent()->hasOptSize())
111 return TTI::AMK_None;
112
113 if (ST->isMClass() && ST->isThumb2() &&
114 L->getNumBlocks() == 1)
115 return TTI::AMK_PreIndexed;
116
117 return TTI::AMK_None;
118 }
119
120 Optional<Instruction *>
121 ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
122 using namespace PatternMatch;
123 Intrinsic::ID IID = II.getIntrinsicID();
124 switch (IID) {
125 default:
126 break;
127 case Intrinsic::arm_neon_vld1: {
128 Align MemAlign =
129 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
130 &IC.getAssumptionCache(), &IC.getDominatorTree());
131 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
132 return IC.replaceInstUsesWith(II, V);
133 }
134 break;
135 }
136
137 case Intrinsic::arm_neon_vld2:
138 case Intrinsic::arm_neon_vld3:
139 case Intrinsic::arm_neon_vld4:
140 case Intrinsic::arm_neon_vld2lane:
141 case Intrinsic::arm_neon_vld3lane:
142 case Intrinsic::arm_neon_vld4lane:
143 case Intrinsic::arm_neon_vst1:
144 case Intrinsic::arm_neon_vst2:
145 case Intrinsic::arm_neon_vst3:
146 case Intrinsic::arm_neon_vst4:
147 case Intrinsic::arm_neon_vst2lane:
148 case Intrinsic::arm_neon_vst3lane:
149 case Intrinsic::arm_neon_vst4lane: {
150 Align MemAlign =
151 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
152 &IC.getAssumptionCache(), &IC.getDominatorTree());
153 unsigned AlignArg = II.arg_size() - 1;
154 Value *AlignArgOp = II.getArgOperand(AlignArg);
155 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
156 if (Align && *Align < MemAlign) {
157 return IC.replaceOperand(
158 II, AlignArg,
159 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
160 false));
161 }
162 break;
163 }
164
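// arm_mve_pred_i2v converts an i32 lane mask into a vector predicate and
// arm_mve_pred_v2i performs the inverse conversion; a round trip through the
// opposite intrinsic can simply be folded away, as below.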
165 case Intrinsic::arm_mve_pred_i2v: {
166 Value *Arg = II.getArgOperand(0);
167 Value *ArgArg;
168 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
169 PatternMatch::m_Value(ArgArg))) &&
170 II.getType() == ArgArg->getType()) {
171 return IC.replaceInstUsesWith(II, ArgArg);
172 }
173 Constant *XorMask;
174 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
175 PatternMatch::m_Value(ArgArg)),
176 PatternMatch::m_Constant(XorMask))) &&
177 II.getType() == ArgArg->getType()) {
178 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
179 if (CI->getValue().trunc(16).isAllOnes()) {
180 auto TrueVector = IC.Builder.CreateVectorSplat(
181 cast<FixedVectorType>(II.getType())->getNumElements(),
182 IC.Builder.getTrue());
183 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
184 }
185 }
186 }
187 KnownBits ScalarKnown(32);
188 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
189 ScalarKnown, 0)) {
190 return &II;
191 }
192 break;
193 }
194 case Intrinsic::arm_mve_pred_v2i: {
195 Value *Arg = II.getArgOperand(0);
196 Value *ArgArg;
197 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
198 PatternMatch::m_Value(ArgArg)))) {
199 return IC.replaceInstUsesWith(II, ArgArg);
200 }
201 if (!II.getMetadata(LLVMContext::MD_range)) {
202 Type *IntTy32 = Type::getInt32Ty(II.getContext());
203 Metadata *M[] = {
204 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
205 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
206 II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
207 return &II;
208 }
209 break;
210 }
211 case Intrinsic::arm_mve_vadc:
212 case Intrinsic::arm_mve_vadc_predicated: {
213 unsigned CarryOp =
214 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
215 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
216 "Bad type for intrinsic!");
217
218 KnownBits CarryKnown(32);
219 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
220 CarryKnown)) {
221 return &II;
222 }
223 break;
224 }
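// The vmldava intrinsic is a multiply-accumulate reduction across vector
// lanes (VMLADAV family). If the accumulator operand (operand 3) is zero and
// the result's only use is an add, fold the addend into the accumulator of a
// fresh vmldava call instead.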
225 case Intrinsic::arm_mve_vmldava: {
226 Instruction *I = cast<Instruction>(&II);
227 if (I->hasOneUse()) {
228 auto *User = cast<Instruction>(*I->user_begin());
229 Value *OpZ;
230 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
231 match(I->getOperand(3), m_Zero())) {
232 Value *OpX = I->getOperand(4);
233 Value *OpY = I->getOperand(5);
234 Type *OpTy = OpX->getType();
235
236 IC.Builder.SetInsertPoint(User);
237 Value *V =
238 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
239 {I->getOperand(0), I->getOperand(1),
240 I->getOperand(2), OpZ, OpX, OpY});
241
242 IC.replaceInstUsesWith(*User, V);
243 return IC.eraseInstFromFunction(*User);
244 }
245 }
246 return None;
247 }
248 }
249 return None;
250 }
251
252 Optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
253 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
254 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
255 std::function<void(Instruction *, unsigned, APInt, APInt &)>
256 SimplifyAndSetOp) const {
257
258 // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
259 // opcode specifying a Top/Bottom instruction, which can change between
260 // instructions.
261 auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
262 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
263 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
264
265 // Only the odd or even lanes of operand 0 are demanded, depending on
266 // whether this is a top or bottom instruction.
267 APInt DemandedElts =
268 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
269 : APInt::getHighBitsSet(2, 1));
270 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
271 // The other lanes will be defined from the inserted elements.
272 UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
273 : APInt::getHighBitsSet(2, 1));
274 return None;
275 };
276
277 switch (II.getIntrinsicID()) {
278 default:
279 break;
280 case Intrinsic::arm_mve_vcvt_narrow:
281 SimplifyNarrowInstrTopBottom(2);
282 break;
283 case Intrinsic::arm_mve_vqmovn:
284 SimplifyNarrowInstrTopBottom(4);
285 break;
286 case Intrinsic::arm_mve_vshrn:
287 SimplifyNarrowInstrTopBottom(7);
288 break;
289 }
290
291 return None;
292 }
293
294 InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
295 TTI::TargetCostKind CostKind) {
296 assert(Ty->isIntegerTy());
297
298 unsigned Bits = Ty->getPrimitiveSizeInBits();
299 if (Bits == 0 || Imm.getActiveBits() >= 64)
300 return 4;
301
302 int64_t SImmVal = Imm.getSExtValue();
303 uint64_t ZImmVal = Imm.getZExtValue();
304 if (!ST->isThumb()) {
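// Treat small positive immediates (0..65535), ARM modified immediates (an
// 8-bit value rotated right by an even amount) and their bitwise complements
// (materialised with MVN) as single instructions.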
305 if ((SImmVal >= 0 && SImmVal < 65536) ||
306 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
307 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
308 return 1;
309 return ST->hasV6T2Ops() ? 2 : 3;
310 }
311 if (ST->isThumb2()) {
312 if ((SImmVal >= 0 && SImmVal < 65536) ||
313 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
314 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
315 return 1;
316 return ST->hasV6T2Ops() ? 2 : 3;
317 }
318 // Thumb1. Any i8 immediate costs 1.
319 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
320 return 1;
321 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
322 return 2;
323 // Load from constantpool.
324 return 3;
325 }
326
327 // Constants smaller than 256 fit in the immediate field of
328 // Thumb1 instructions, so we return a cost of zero for them and 1 otherwise.
329 InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
330 const APInt &Imm, Type *Ty) {
331 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
332 return 0;
333
334 return 1;
335 }
336
337 // Checks whether Inst is part of a min(max()) or max(min()) pattern
338 // that will match to an SSAT instruction. Returns the instruction being
339 // saturated, or null if no saturation pattern was found.
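// For example, smax(smin(x, 127), -128) clamps x to the signed 8-bit range
// and can be selected as SSAT; here Imm would be -128 and the inner smin
// constant (-Imm) - 1 == 127.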
340 static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
341 Value *LHS, *RHS;
342 ConstantInt *C;
343 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
344
345 if (InstSPF == SPF_SMAX &&
346 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
347 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
348
349 auto isSSatMin = [&](Value *MinInst) {
350 if (isa<SelectInst>(MinInst)) {
351 Value *MinLHS, *MinRHS;
352 ConstantInt *MinC;
353 SelectPatternFlavor MinSPF =
354 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
355 if (MinSPF == SPF_SMIN &&
356 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
357 MinC->getValue() == ((-Imm) - 1))
358 return true;
359 }
360 return false;
361 };
362
363 if (isSSatMin(Inst->getOperand(1)))
364 return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
365 if (Inst->hasNUses(2) &&
366 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
367 return Inst->getOperand(1);
368 }
369 return nullptr;
370 }
371
372 // Look for an FP saturation pattern, where the instruction can be simplified
373 // to a fptosi.sat of the form max(min(fptosi)). The constant in this case is always free.
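// For example (illustrative), smax(smin(fptosi(%f), 2147483647), -2147483648)
// can be formed into a call to llvm.fptosi.sat.i32.f32.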
374 static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
375 if (Imm.getBitWidth() != 64 ||
376 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
377 return false;
378 Value *FP = isSSATMinMaxPattern(Inst, Imm);
379 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
380 FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
381 if (!FP)
382 return false;
383 return isa<FPToSIInst>(FP);
384 }
385
386 InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
387 const APInt &Imm, Type *Ty,
388 TTI::TargetCostKind CostKind,
389 Instruction *Inst) {
390 // Division by a constant can be turned into multiplication, but only if we
391 // know it's constant. So it's not so much that the immediate is cheap (it's
392 // not), but that the alternative is worse.
393 // FIXME: this is probably unneeded with GlobalISel.
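// For example, 'sdiv i32 %x, 10' is typically lowered to a multiply by a
// "magic" constant plus shifts, so hoisting the divisor out as an opaque
// constant would only make things worse.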
394 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
395 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
396 Idx == 1)
397 return 0;
398
399 // Leave any gep offsets for CodeGenPrepare, which will do a better job at
400 // splitting any large offsets.
401 if (Opcode == Instruction::GetElementPtr && Idx != 0)
402 return 0;
403
404 if (Opcode == Instruction::And) {
405 // UXTB/UXTH
406 if (Imm == 255 || Imm == 65535)
407 return 0;
408 // Conversion to BIC is free, and means we can use ~Imm instead.
409 return std::min(getIntImmCost(Imm, Ty, CostKind),
410 getIntImmCost(~Imm, Ty, CostKind));
411 }
412
413 if (Opcode == Instruction::Add)
414 // Conversion to SUB is free, and means we can use -Imm instead.
415 return std::min(getIntImmCost(Imm, Ty, CostKind),
416 getIntImmCost(-Imm, Ty, CostKind));
417
418 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
419 Ty->getIntegerBitWidth() == 32) {
420 int64_t NegImm = -Imm.getSExtValue();
421 if (ST->isThumb2() && NegImm < 1<<12)
422 // icmp X, #-C -> cmn X, #C
423 return 0;
424 if (ST->isThumb() && NegImm < 1<<8)
425 // icmp X, #-C -> adds X, #C
426 return 0;
427 }
428
429 // xor a, -1 can always be folded to MVN
430 if (Opcode == Instruction::Xor && Imm.isAllOnes())
431 return 0;
432
433 // Ensure that negative constants of min(max()) or max(min()) patterns that
434 // match SSAT instructions don't get hoisted.
435 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
436 Ty->getIntegerBitWidth() <= 32) {
437 if (isSSATMinMaxPattern(Inst, Imm) ||
438 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
439 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
440 return 0;
441 }
442
443 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
444 return 0;
445
446 // We can convert <= -1 to < 0, which is generally quite cheap.
447 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) {
448 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
449 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
450 return std::min(getIntImmCost(Imm, Ty, CostKind),
451 getIntImmCost(Imm + 1, Ty, CostKind));
452 }
453
454 return getIntImmCost(Imm, Ty, CostKind);
455 }
456
457 InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
458 TTI::TargetCostKind CostKind,
459 const Instruction *I) {
460 if (CostKind == TTI::TCK_RecipThroughput &&
461 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
462 // FIXME: The vectorizer is highly sensitive to the cost of these
463 // instructions, which suggests that it may be using the costs incorrectly.
464 // But, for now, just make them free to avoid performance regressions for
465 // vector targets.
466 return 0;
467 }
468 return BaseT::getCFInstrCost(Opcode, CostKind, I);
469 }
470
471 InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
472 Type *Src,
473 TTI::CastContextHint CCH,
474 TTI::TargetCostKind CostKind,
475 const Instruction *I) {
476 int ISD = TLI->InstructionOpcodeToISD(Opcode);
477 assert(ISD && "Invalid opcode");
478
479 // TODO: Allow non-throughput costs that aren't binary.
480 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
481 if (CostKind != TTI::TCK_RecipThroughput)
482 return Cost == 0 ? 0 : 1;
483 return Cost;
484 };
485 auto IsLegalFPType = [this](EVT VT) {
486 EVT EltVT = VT.getScalarType();
487 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
488 (EltVT == MVT::f64 && ST->hasFP64()) ||
489 (EltVT == MVT::f16 && ST->hasFullFP16());
490 };
491
492 EVT SrcTy = TLI->getValueType(DL, Src);
493 EVT DstTy = TLI->getValueType(DL, Dst);
494
495 if (!SrcTy.isSimple() || !DstTy.isSimple())
496 return AdjustCost(
497 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
498
499 // Extending masked loads and truncating masked stores are expensive because
500 // we currently don't split them. This means that we'll likely end up
501 // loading/storing each element individually (hence the high cost).
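// For example, a sext of a masked <8 x i16> load up to <8 x i32> (wider than
// 128 bits) is modelled below as roughly two operations per produced element,
// scaled by the MVE cost factor.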
502 if ((ST->hasMVEIntegerOps() &&
503 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
504 Opcode == Instruction::SExt)) ||
505 (ST->hasMVEFloatOps() &&
506 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
507 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
508 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
509 return 2 * DstTy.getVectorNumElements() *
510 ST->getMVEVectorCostFactor(CostKind);
511
512 // The extension of other kinds of loads is free.
513 if (CCH == TTI::CastContextHint::Normal ||
514 CCH == TTI::CastContextHint::Masked) {
515 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
516 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
517 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
518 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
519 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
520 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
521 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
522 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
523 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
524 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
525 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
526 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
527 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
528 };
529 if (const auto *Entry = ConvertCostTableLookup(
530 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
531 return AdjustCost(Entry->Cost);
532
533 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
534 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
535 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
536 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
537 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
538 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
539 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
540 // The following extend from a legal type to an illegal type, so the load
541 // needs to be split. This introduces an extra load operation, but the
542 // extend is still "free".
543 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
544 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
545 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
546 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
547 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
548 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
549 };
550 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
551 if (const auto *Entry =
552 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
553 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
554 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
555 }
556
557 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
558 // FPExtends are similar but also require the VCVT instructions.
559 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
560 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
561 };
562 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
563 if (const auto *Entry =
564 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
565 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
566 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
567 }
568
569 // The truncate of a store is free. This is the mirror of extends above.
570 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
571 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
572 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
573 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
574 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
575 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
576 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
577 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
578 };
579 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
580 if (const auto *Entry =
581 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
582 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
583 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
584 }
585
586 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
587 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
588 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
589 };
590 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
591 if (const auto *Entry =
592 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
593 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
594 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
595 }
596 }
597
598 // NEON vector operations that can extend their inputs.
599 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
600 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
601 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
602 // vaddl
603 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
604 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
605 // vsubl
606 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
607 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
608 // vmull
609 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
610 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
611 // vshll
612 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
613 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
614 };
615
616 auto *User = cast<Instruction>(*I->user_begin());
617 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
618 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
619 DstTy.getSimpleVT(),
620 SrcTy.getSimpleVT())) {
621 return AdjustCost(Entry->Cost);
622 }
623 }
624
625 // Single to/from double precision conversions.
626 if (Src->isVectorTy() && ST->hasNEON() &&
627 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
628 DstTy.getScalarType() == MVT::f32) ||
629 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
630 DstTy.getScalarType() == MVT::f64))) {
631 static const CostTblEntry NEONFltDblTbl[] = {
632 // Vector fptrunc/fpext conversions.
633 {ISD::FP_ROUND, MVT::v2f64, 2},
634 {ISD::FP_EXTEND, MVT::v2f32, 2},
635 {ISD::FP_EXTEND, MVT::v4f32, 4}};
636
637 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
638 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
639 return AdjustCost(LT.first * Entry->Cost);
640 }
641
642 // Some arithmetic, load and store operations have specific instructions
643 // to cast up/down their types automatically at no extra cost.
644 // TODO: Get these tables to know at least what the related operations are.
645 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
646 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
647 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
648 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
649 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
650 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
651 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
652
653 // The number of vmovl instructions for the extension.
654 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
655 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
656 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
657 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
658 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
659 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
660 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
661 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
662 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
663 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
664 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
665 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
666 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
667 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
668 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
669 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
670 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
671 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
672
673 // Operations that we legalize using splitting.
674 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
675 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
676
677 // Vector float <-> i32 conversions.
678 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
679 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
680
681 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
682 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
683 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
684 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
685 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
686 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
687 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
688 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
689 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
690 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
691 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
692 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
693 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
694 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
695 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
696 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
697 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
698 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
699 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
700 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
701
702 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
703 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
704 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
705 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
706 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
707 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
708
709 // Vector double <-> i32 conversions.
710 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
711 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
712
713 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
714 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
715 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
716 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
717 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
718 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
719
720 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
721 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
722 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
723 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
724 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
725 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
726 };
727
728 if (SrcTy.isVector() && ST->hasNEON()) {
729 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
730 DstTy.getSimpleVT(),
731 SrcTy.getSimpleVT()))
732 return AdjustCost(Entry->Cost);
733 }
734
735 // Scalar float to integer conversions.
736 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
737 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
738 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
739 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
740 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
741 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
742 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
743 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
744 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
745 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
746 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
747 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
748 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
749 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
750 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
751 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
752 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
753 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
754 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
755 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
756 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
757 };
758 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
759 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
760 DstTy.getSimpleVT(),
761 SrcTy.getSimpleVT()))
762 return AdjustCost(Entry->Cost);
763 }
764
765 // Scalar integer to float conversions.
766 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
767 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
768 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
769 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
770 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
771 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
772 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
773 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
774 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
775 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
776 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
777 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
778 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
779 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
780 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
781 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
782 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
783 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
784 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
785 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
786 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
787 };
788
789 if (SrcTy.isInteger() && ST->hasNEON()) {
790 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
791 ISD, DstTy.getSimpleVT(),
792 SrcTy.getSimpleVT()))
793 return AdjustCost(Entry->Cost);
794 }
795
796 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
797 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
798 // are linearised so take more.
799 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
800 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
801 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
802 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
803 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
804 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
805 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
806 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
807 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
808 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
809 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
810 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
811 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
812 };
813
814 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
815 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
816 ISD, DstTy.getSimpleVT(),
817 SrcTy.getSimpleVT()))
818 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
819 }
820
821 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
822 // As a general rule, fp converts that were not matched above are scalarized
823 // and cost 1 vcvt for each lane, so long as the instruction is available.
824 // If not, it will become a series of function calls.
825 const InstructionCost CallCost =
826 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
827 int Lanes = 1;
828 if (SrcTy.isFixedLengthVector())
829 Lanes = SrcTy.getVectorNumElements();
830
831 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
832 return Lanes;
833 else
834 return Lanes * CallCost;
835 }
836
837 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
838 SrcTy.isFixedLengthVector()) {
839 // Treat a truncate with a larger-than-legal source (128 bits for MVE) as
840 // expensive: 2 instructions per lane.
841 if ((SrcTy.getScalarType() == MVT::i8 ||
842 SrcTy.getScalarType() == MVT::i16 ||
843 SrcTy.getScalarType() == MVT::i32) &&
844 SrcTy.getSizeInBits() > 128 &&
845 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
846 return SrcTy.getVectorNumElements() * 2;
847 }
848
849 // Scalar integer conversion costs.
850 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
851 // i16 -> i64 requires two dependent operations.
852 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
853
854 // Truncates on i64 are assumed to be free.
855 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
856 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
857 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
858 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
859 };
860
861 if (SrcTy.isInteger()) {
862 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
863 DstTy.getSimpleVT(),
864 SrcTy.getSimpleVT()))
865 return AdjustCost(Entry->Cost);
866 }
867
868 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
869 ? ST->getMVEVectorCostFactor(CostKind)
870 : 1;
871 return AdjustCost(
872 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
873 }
874
875 InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
876 unsigned Index) {
877 // Penalize inserting into a D-subregister. We end up with a three times
878 // lower estimated throughput on Swift.
879 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
880 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
881 return 3;
882
883 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
884 Opcode == Instruction::ExtractElement)) {
885 // Cross-class copies are expensive on many microarchitectures,
886 // so assume they are expensive by default.
887 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
888 return 3;
889
890 // Even if it's not a cross-class copy, this likely leads to mixing
891 // of NEON and VFP code and should therefore be penalized.
892 if (ValTy->isVectorTy() &&
893 ValTy->getScalarSizeInBits() <= 32)
894 return std::max<InstructionCost>(
895 BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
896 }
897
898 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
899 Opcode == Instruction::ExtractElement)) {
900 // Integer cross-lane moves are more expensive than float, which can
901 // sometimes just be vmovs. Integer moves involve being passed through GPR
902 // registers, causing more of a delay.
903 std::pair<InstructionCost, MVT> LT =
904 getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
905 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
906 }
907
908 return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
909 }
910
911 InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
912 Type *CondTy,
913 CmpInst::Predicate VecPred,
914 TTI::TargetCostKind CostKind,
915 const Instruction *I) {
916 int ISD = TLI->InstructionOpcodeToISD(Opcode);
917
918 // Thumb scalar code size cost for select.
919 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
920 ST->isThumb() && !ValTy->isVectorTy()) {
921 // Assume expensive structs.
922 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
923 return TTI::TCC_Expensive;
924
925 // Select costs can vary because they:
926 // - may require one or more conditional mov (including an IT),
927 // - can't operate directly on immediates,
928 // - require live flags, which we can't copy around easily.
929 InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
930
931 // Possible IT instruction for Thumb2, or more for Thumb1.
932 ++Cost;
933
934 // i1 values may need rematerialising by using mov immediates and/or
935 // flag setting instructions.
936 if (ValTy->isIntegerTy(1))
937 ++Cost;
938
939 return Cost;
940 }
941
942 // If this is a vector min/max/abs, use the cost of that intrinsic directly
943 // instead. Hopefully when min/max intrinsics are more prevalent this code
944 // will not be needed.
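// For example, 'select (icmp slt %a, %b), %a, %b' is an smin: the compare is
// treated as free and the select is costed as the llvm.smin intrinsic.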
945 const Instruction *Sel = I;
946 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
947 Sel->hasOneUse())
948 Sel = cast<Instruction>(Sel->user_back());
949 if (Sel && ValTy->isVectorTy() &&
950 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
951 const Value *LHS, *RHS;
952 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
953 unsigned IID = 0;
954 switch (SPF) {
955 case SPF_ABS:
956 IID = Intrinsic::abs;
957 break;
958 case SPF_SMIN:
959 IID = Intrinsic::smin;
960 break;
961 case SPF_SMAX:
962 IID = Intrinsic::smax;
963 break;
964 case SPF_UMIN:
965 IID = Intrinsic::umin;
966 break;
967 case SPF_UMAX:
968 IID = Intrinsic::umax;
969 break;
970 case SPF_FMINNUM:
971 IID = Intrinsic::minnum;
972 break;
973 case SPF_FMAXNUM:
974 IID = Intrinsic::maxnum;
975 break;
976 default:
977 break;
978 }
979 if (IID) {
980 // The ICmp is free, the select gets the cost of the min/max/etc
981 if (Sel != I)
982 return 0;
983 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
984 return getIntrinsicInstrCost(CostAttrs, CostKind);
985 }
986 }
987
988 // On NEON a vector select gets lowered to vbsl.
989 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
990 // Lowering of some vector selects is currently far from perfect.
991 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
992 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
993 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
994 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
995 };
996
997 EVT SelCondTy = TLI->getValueType(DL, CondTy);
998 EVT SelValTy = TLI->getValueType(DL, ValTy);
999 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1000 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1001 SelCondTy.getSimpleVT(),
1002 SelValTy.getSimpleVT()))
1003 return Entry->Cost;
1004 }
1005
1006 std::pair<InstructionCost, MVT> LT =
1007 TLI->getTypeLegalizationCost(DL, ValTy);
1008 return LT.first;
1009 }
1010
1011 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1012 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1013 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1014 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1015 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1016 if (!VecCondTy)
1017 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1018
1019 // If we don't have mve.fp any fp operations will need to be scalarized.
1020 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1021 // One scalarization insert, one scalarization extract and the cost of the
1022 // fcmps.
1023 return BaseT::getScalarizationOverhead(VecValTy, false, true) +
1024 BaseT::getScalarizationOverhead(VecCondTy, true, false) +
1025 VecValTy->getNumElements() *
1026 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1027 VecCondTy->getScalarType(), VecPred, CostKind,
1028 I);
1029 }
1030
1031 std::pair<InstructionCost, MVT> LT =
1032 TLI->getTypeLegalizationCost(DL, ValTy);
1033 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1034 // There are two types - the input that specifies the type of the compare
1035 // and the output vXi1 type. Because we don't know how the output will be
1036 // split, we may need an expensive shuffle to get two in sync. This has the
1037 // effect of making larger than legal compares (v8i32 for example)
1038 // expensive.
1039 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1040 if (LT.first > 1)
1041 return LT.first * BaseCost +
1042 BaseT::getScalarizationOverhead(VecCondTy, true, false);
1043 return BaseCost;
1044 }
1045 }
1046
1047 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1048 // for "multiple beats" potentially needed by MVE instructions.
1049 int BaseCost = 1;
1050 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1051 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1052
1053 return BaseCost *
1054 BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1055 }
1056
1057 InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1058 ScalarEvolution *SE,
1059 const SCEV *Ptr) {
1060 // Address computations in vectorized code with non-consecutive addresses will
1061 // likely result in more instructions compared to scalar code where the
1062 // computation can more often be merged into the index mode. The resulting
1063 // extra micro-ops can significantly decrease throughput.
1064 unsigned NumVectorInstToHideOverhead = 10;
1065 int MaxMergeDistance = 64;
1066
1067 if (ST->hasNEON()) {
1068 if (Ty->isVectorTy() && SE &&
1069 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1070 return NumVectorInstToHideOverhead;
1071
1072 // In many cases the address computation is not merged into the instruction
1073 // addressing mode.
1074 return 1;
1075 }
1076 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1077 }
1078
1079 bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1080 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1081 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1082 // optimized, else LSR may block tail-predication.
1083 switch (II->getIntrinsicID()) {
1084 case Intrinsic::arm_mve_vctp8:
1085 case Intrinsic::arm_mve_vctp16:
1086 case Intrinsic::arm_mve_vctp32:
1087 case Intrinsic::arm_mve_vctp64:
1088 return true;
1089 default:
1090 break;
1091 }
1092 }
1093 return false;
1094 }
1095
1096 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1097 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1098 return false;
1099
1100 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1101 // Don't support v2i1 yet.
1102 if (VecTy->getNumElements() == 2)
1103 return false;
1104
1105 // We don't support extending fp types.
1106 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1107 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1108 return false;
1109 }
1110
1111 unsigned EltWidth = DataTy->getScalarSizeInBits();
1112 return (EltWidth == 32 && Alignment >= 4) ||
1113 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1114 }
1115
1116 bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1117 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1118 return false;
1119
1120 unsigned EltWidth = Ty->getScalarSizeInBits();
1121 return ((EltWidth == 32 && Alignment >= 4) ||
1122 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1123 }
1124
1125 /// Given a memcpy/memset/memmove instruction, return the number of memory
1126 /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1127 /// call is used.
1128 int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1129 MemOp MOp;
1130 unsigned DstAddrSpace = ~0u;
1131 unsigned SrcAddrSpace = ~0u;
1132 const Function *F = I->getParent()->getParent();
1133
1134 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1135 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1136 // If 'size' is not a constant, a library call will be generated.
1137 if (!C)
1138 return -1;
1139
1140 const unsigned Size = C->getValue().getZExtValue();
1141 const Align DstAlign = *MC->getDestAlign();
1142 const Align SrcAlign = *MC->getSourceAlign();
1143
1144 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1145 /*IsVolatile*/ false);
1146 DstAddrSpace = MC->getDestAddressSpace();
1147 SrcAddrSpace = MC->getSourceAddressSpace();
1148 }
1149 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1150 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1151 // If 'size' is not a constant, a library call will be generated.
1152 if (!C)
1153 return -1;
1154
1155 const unsigned Size = C->getValue().getZExtValue();
1156 const Align DstAlign = *MS->getDestAlign();
1157
1158 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1159 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1160 DstAddrSpace = MS->getDestAddressSpace();
1161 }
1162 else
1163 llvm_unreachable("Expected a memcpy/move or memset!");
1164
1165 unsigned Limit, Factor = 2;
1166 switch(I->getIntrinsicID()) {
1167 case Intrinsic::memcpy:
1168 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1169 break;
1170 case Intrinsic::memmove:
1171 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1172 break;
1173 case Intrinsic::memset:
1174 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1175 Factor = 1;
1176 break;
1177 default:
1178 llvm_unreachable("Expected a memcpy/move or memset!");
1179 }
1180
1181 // MemOps will be populated with a list of data types that need to be
1182 // loaded and stored. That's why we multiply the number of elements by 2 to
1183 // get the cost for this memcpy.
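// For memcpy/memmove each chosen chunk type implies one load plus one store
// (Factor == 2); memset only stores, so it uses Factor == 1.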
1184 std::vector<EVT> MemOps;
1185 if (getTLI()->findOptimalMemOpLowering(
1186 MemOps, Limit, MOp, DstAddrSpace,
1187 SrcAddrSpace, F->getAttributes()))
1188 return MemOps.size() * Factor;
1189
1190 // If we can't find an optimal memop lowering, return the default cost
1191 return -1;
1192 }
1193
1194 InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1195 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1196
1197 // To model the cost of a library call, we assume 1 for the call, and
1198 // 3 for the argument setup.
1199 if (NumOps == -1)
1200 return 4;
1201 return NumOps;
1202 }
1203
1204 InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1205 VectorType *Tp, ArrayRef<int> Mask,
1206 int Index, VectorType *SubTp,
1207 ArrayRef<const Value *> Args) {
1208 Kind = improveShuffleKindFromMask(Kind, Mask);
1209 if (ST->hasNEON()) {
1210 if (Kind == TTI::SK_Broadcast) {
1211 static const CostTblEntry NEONDupTbl[] = {
1212 // VDUP handles these cases.
1213 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1214 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1215 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1216 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1217 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1218 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1219
1220 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1221 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1222 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1223 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1224
1225 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1226 if (const auto *Entry =
1227 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1228 return LT.first * Entry->Cost;
1229 }
1230 if (Kind == TTI::SK_Reverse) {
1231 static const CostTblEntry NEONShuffleTbl[] = {
1232 // Reverse shuffles cost one instruction if we are shuffling within a
1233 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1234 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1235 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1236 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1237 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1238 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1239 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1240
1241 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1242 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1243 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1244 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1245
1246 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1247 if (const auto *Entry =
1248 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1249 return LT.first * Entry->Cost;
1250 }
1251 if (Kind == TTI::SK_Select) {
1252 static const CostTblEntry NEONSelShuffleTbl[] = {
1253 // Select shuffle cost table for ARM. Cost is the number of
1254 // instructions
1255 // required to create the shuffled vector.
1256
1257 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1258 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1259 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1260 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1261
1262 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1263 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1264 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1265
1266 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1267
1268 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1269
1270 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1271 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1272 ISD::VECTOR_SHUFFLE, LT.second))
1273 return LT.first * Entry->Cost;
1274 }
1275 }
1276 if (ST->hasMVEIntegerOps()) {
1277 if (Kind == TTI::SK_Broadcast) {
1278 static const CostTblEntry MVEDupTbl[] = {
1279 // VDUP handles these cases.
1280 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1281 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1282 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1283 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1284 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1285
1286 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1287 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1288 LT.second))
1289 return LT.first * Entry->Cost *
1290 ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
1291 }
1292
1293 if (!Mask.empty()) {
1294 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1295 if (LT.second.isVector() &&
1296 Mask.size() <= LT.second.getVectorNumElements() &&
1297 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1298 isVREVMask(Mask, LT.second, 64)))
1299 return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1300 }
1301 }
1302
1303 int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1304 ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1305 : 1;
1306 return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
1307 }
1308
1309 InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1310 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1311 TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
1312 TTI::OperandValueProperties Opd1PropInfo,
1313 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1314 const Instruction *CxtI) {
1315 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1316 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1317 // Make operations on i1 relatively expensive as this often involves
1318 // combining predicates. AND and XOR should be easier to handle with IT
1319 // blocks.
1320 switch (ISDOpcode) {
1321 default:
1322 break;
1323 case ISD::AND:
1324 case ISD::XOR:
1325 return 2;
1326 case ISD::OR:
1327 return 3;
1328 }
1329 }
1330
1331 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1332
1333 if (ST->hasNEON()) {
1334 const unsigned FunctionCallDivCost = 20;
1335 const unsigned ReciprocalDivCost = 10;
1336 static const CostTblEntry CostTbl[] = {
1337 // Division.
1338 // These costs are somewhat arbitrary. Choose a cost of 20 to indicate that
1339 // vectorizing division (added function call) is going to be very expensive.
1340 // Double registers types.
1341 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1342 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1343 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1344 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1345 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1346 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1347 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1348 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1349 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1350 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1351 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1352 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1353 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1354 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1355 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1356 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1357 // Quad register types.
1358 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1359 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1360 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1361 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1362 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1363 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1364 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1365 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1366 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1367 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1368 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1369 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1370 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1371 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1372 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1373 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1374 // Multiplication.
1375 };
1376
1377 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1378 return LT.first * Entry->Cost;
1379
1380 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1381 Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
1382
1383 // This is somewhat of a hack. The problem that we are facing is that SROA
1384 // creates a sequence of shift, and, or instructions to construct values.
1385 // These sequences are recognized by ISel and have zero cost. Not so for
1386 // the vectorized code. Because we have support for v2i64 but not i64 those
1387 // sequences look particularly beneficial to vectorize.
1388 // To work around this we increase the cost of v2i64 operations to make them
1389 // seem less beneficial.
1390 if (LT.second == MVT::v2i64 &&
1391 Op2Info == TargetTransformInfo::OK_UniformConstantValue)
1392 Cost += 4;
1393
1394 return Cost;
1395 }
1396
1397 // If this operation is a shift on ARM/Thumb2, it might well be folded into
1398 // the following instruction, hence having a cost of 0.
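// For example, in 'add r0, r1, r2, lsl #2' the shift is part of the second
// operand and costs nothing extra.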
1399 auto LooksLikeAFreeShift = [&]() {
1400 if (ST->isThumb1Only() || Ty->isVectorTy())
1401 return false;
1402
1403 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1404 return false;
1405 if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
1406 return false;
1407
1408 // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1409 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1410 case Instruction::Add:
1411 case Instruction::Sub:
1412 case Instruction::And:
1413 case Instruction::Xor:
1414 case Instruction::Or:
1415 case Instruction::ICmp:
1416 return true;
1417 default:
1418 return false;
1419 }
1420 };
1421 if (LooksLikeAFreeShift())
1422 return 0;
1423
1424 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1425 // for "multiple beats" potentially needed by MVE instructions.
1426 int BaseCost = 1;
1427 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1428 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1429
1430   // The rest of this mostly follows what is done in
1431   // BaseT::getArithmeticInstrCost, without treating floats as more expensive
1432   // than scalars or increasing the costs for custom operations. The result is
1433   // also multiplied by the MVEVectorCostFactor where appropriate.
1434 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1435 return LT.first * BaseCost;
1436
1437 // Else this is expand, assume that we need to scalarize this op.
1438 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1439 unsigned Num = VTy->getNumElements();
1440 InstructionCost Cost =
1441 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1442 // Return the cost of multiple scalar invocation plus the cost of
1443 // inserting and extracting the values.
1444 SmallVector<Type *> Tys(Args.size(), Ty);
1445 return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
1446 }
1447
1448 return BaseCost;
1449 }
1450
1451 InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1452 MaybeAlign Alignment,
1453 unsigned AddressSpace,
1454 TTI::TargetCostKind CostKind,
1455 const Instruction *I) {
1456 // TODO: Handle other cost kinds.
1457 if (CostKind != TTI::TCK_RecipThroughput)
1458 return 1;
1459
1460 // Type legalization can't handle structs
1461 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1462 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1463 CostKind);
1464
1465 if (ST->hasNEON() && Src->isVectorTy() &&
1466 (Alignment && *Alignment != Align(16)) &&
1467 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1468 // Unaligned loads/stores are extremely inefficient.
1469     // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1470 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1471 return LT.first * 4;
1472 }
1473
1474 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1475 // Same for stores.
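  // (Illustrative IR shape being matched:)
  //   %l = load <4 x half>, <4 x half>* %p
  //   %e = fpext <4 x half> %l to <4 x float>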
1476 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1477 ((Opcode == Instruction::Load && I->hasOneUse() &&
1478 isa<FPExtInst>(*I->user_begin())) ||
1479 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1480 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1481 Type *DstTy =
1482 Opcode == Instruction::Load
1483 ? (*I->user_begin())->getType()
1484 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1485 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1486 DstTy->getScalarType()->isFloatTy())
1487 return ST->getMVEVectorCostFactor(CostKind);
1488 }
1489
1490 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1491 ? ST->getMVEVectorCostFactor(CostKind)
1492 : 1;
1493 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1494 CostKind, I);
1495 }
1496
1497 InstructionCost
1498 ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1499 unsigned AddressSpace,
1500 TTI::TargetCostKind CostKind) {
1501 if (ST->hasMVEIntegerOps()) {
1502 if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1503 return ST->getMVEVectorCostFactor(CostKind);
1504 if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1505 return ST->getMVEVectorCostFactor(CostKind);
1506 }
1507 if (!isa<FixedVectorType>(Src))
1508 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1509 CostKind);
1510   // Scalar cost, which is currently very high due to the inefficiency of the
1511   // generated code.
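  // (Illustrative) e.g. a masked load of <4 x i32> without MVE is costed at
  // 4 * 8 = 32.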
1512 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1513 }
1514
1515 InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1516 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1517 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1518 bool UseMaskForCond, bool UseMaskForGaps) {
1519 assert(Factor >= 2 && "Invalid interleave factor");
1520 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1521
1522   // vldN/vstN don't support vector types with i64/f64 elements.
1523 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1524
1525 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1526 !UseMaskForCond && !UseMaskForGaps) {
1527 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1528 auto *SubVecTy =
1529 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1530
1531     // vldN/vstN only support legal vector types of size 64 or 128 bits.
1532 // Accesses having vector types that are a multiple of 128 bits can be
1533 // matched to more than one vldN/vstN instruction.
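    // (Illustrative) a factor-2 deinterleave of <8 x i32> uses <4 x i32>
    // subvectors, each a single legal 128-bit access, so it is costed as
    // Factor * BaseCost below.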
1534 int BaseCost =
1535 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1536 if (NumElts % Factor == 0 &&
1537 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1538 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1539
1540 // Some smaller than legal interleaved patterns are cheap as we can make
1541 // use of the vmovn or vrev patterns to interleave a standard load. This is
1542 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1543 // promoted differently). The cost of 2 here is then a load and vrev or
1544 // vmovn.
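    // (Illustrative) e.g. a factor-2 access on <8 x i8> fits in a single
    // 64-bit load plus one vrev/vmovn-style rearrangement.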
1545 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1546 VecTy->isIntOrIntVectorTy() &&
1547 DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1548 return 2 * BaseCost;
1549 }
1550
1551 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1552 Alignment, AddressSpace, CostKind,
1553 UseMaskForCond, UseMaskForGaps);
1554 }
1555
1556 InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1557 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1558 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1559 using namespace PatternMatch;
1560 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1561 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1562 Alignment, CostKind, I);
1563
1564 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1565 auto *VTy = cast<FixedVectorType>(DataTy);
1566
1567 // TODO: Splitting, once we do that.
1568
1569 unsigned NumElems = VTy->getNumElements();
1570 unsigned EltSize = VTy->getScalarSizeInBits();
1571 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
1572
1573 // For now, it is assumed that for the MVE gather instructions the loads are
1574 // all effectively serialised. This means the cost is the scalar cost
1575 // multiplied by the number of elements being loaded. This is possibly very
1576 // conservative, but even so we still end up vectorising loops because the
1577 // cost per iteration for many loops is lower than for scalar loops.
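  // (Illustrative) e.g. a gather of <4 x i32> is costed as
  // 4 * LT.first * getMVEVectorCostFactor(CostKind) under this assumption.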
1578 InstructionCost VectorCost =
1579 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1580 // The scalarization cost should be a lot higher. We use the number of vector
1581 // elements plus the scalarization overhead.
1582 InstructionCost ScalarCost =
1583 NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
1584 BaseT::getScalarizationOverhead(VTy, false, true);
1585
1586 if (EltSize < 8 || Alignment < EltSize / 8)
1587 return ScalarCost;
1588
1589 unsigned ExtSize = EltSize;
1590 // Check whether there's a single user that asks for an extended type
1591 if (I != nullptr) {
1592     // Depending on the caller of this function, a gather instruction will
1593     // either have opcode Instruction::Load or be a call to the masked_gather
1594     // intrinsic.
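    // (Illustrative) e.g. a gather of <4 x i8> whose only use is a zext to
    // <4 x i32> can use an extending gather, so cost it on the extended type.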
1595 if ((I->getOpcode() == Instruction::Load ||
1596 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1597 I->hasOneUse()) {
1598 const User *Us = *I->users().begin();
1599 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1600         // Only allow valid type combinations.
1601 unsigned TypeSize =
1602 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1603 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1604 (TypeSize == 16 && EltSize == 8)) &&
1605 TypeSize * NumElems == 128) {
1606 ExtSize = TypeSize;
1607 }
1608 }
1609 }
1610 // Check whether the input data needs to be truncated
1611 TruncInst *T;
1612 if ((I->getOpcode() == Instruction::Store ||
1613 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1614 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1615 // Only allow valid type combinations
1616 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1617 if (((EltSize == 16 && TypeSize == 32) ||
1618 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1619 TypeSize * NumElems == 128)
1620 ExtSize = TypeSize;
1621 }
1622 }
1623
1624 if (ExtSize * NumElems != 128 || NumElems < 4)
1625 return ScalarCost;
1626
1627 // Any (aligned) i32 gather will not need to be scalarised.
1628 if (ExtSize == 32)
1629 return VectorCost;
1630 // For smaller types, we need to ensure that the gep's inputs are correctly
1631 // extended from a small enough value. Other sizes (including i64) are
1632 // scalarized for now.
1633 if (ExtSize != 8 && ExtSize != 16)
1634 return ScalarCost;
1635
1636 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1637 Ptr = BC->getOperand(0);
1638 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1639 if (GEP->getNumOperands() != 2)
1640 return ScalarCost;
1641 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1642 // Scale needs to be correct (which is only relevant for i16s).
1643 if (Scale != 1 && Scale * 8 != ExtSize)
1644 return ScalarCost;
1645 // And we need to zext (not sext) the indexes from a small enough type.
1646 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1647 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1648 return VectorCost;
1649 }
1650 return ScalarCost;
1651 }
1652 return ScalarCost;
1653 }
1654
1655 InstructionCost
1656 ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1657 Optional<FastMathFlags> FMF,
1658 TTI::TargetCostKind CostKind) {
1659 if (TTI::requiresOrderedReduction(FMF))
1660 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1661
1662 EVT ValVT = TLI->getValueType(DL, ValTy);
1663 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1664 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1665 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1666
1667 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1668
1669 static const CostTblEntry CostTblAdd[]{
1670 {ISD::ADD, MVT::v16i8, 1},
1671 {ISD::ADD, MVT::v8i16, 1},
1672 {ISD::ADD, MVT::v4i32, 1},
1673 };
1674 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1675 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1676
1677 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1678 }
1679
1680 InstructionCost
1681 ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1682 Type *ResTy, VectorType *ValTy,
1683 TTI::TargetCostKind CostKind) {
1684 EVT ValVT = TLI->getValueType(DL, ValTy);
1685 EVT ResVT = TLI->getValueType(DL, ResTy);
1686
1687 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1688 std::pair<InstructionCost, MVT> LT =
1689 TLI->getTypeLegalizationCost(DL, ValTy);
1690
1691 // The legal cases are:
1692 // VADDV u/s 8/16/32
1693 // VMLAV u/s 8/16/32
1694 // VADDLV u/s 32
1695 // VMLALV u/s 16/32
1696 // Codegen currently cannot always handle larger than legal vectors very
1697 // well, especially for predicated reductions where the mask needs to be
1698 // split, so restrict to 128bit or smaller input types.
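    // (Illustrative) e.g. a v8i16 multiply-accumulate reducing into i64 maps
    // to VMLALV and is costed here, while a plain v8i16 add reduction into
    // i64 (VADDLV only handles 32-bit elements) falls through to the base
    // cost.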
1699 unsigned RevVTSize = ResVT.getSizeInBits();
1700 if (ValVT.getSizeInBits() <= 128 &&
1701 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1702 (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) ||
1703 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1704 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1705 }
1706
1707 return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1708 CostKind);
1709 }
1710
1711 InstructionCost
1712 ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1713 TTI::TargetCostKind CostKind) {
1714 switch (ICA.getID()) {
1715 case Intrinsic::get_active_lane_mask:
1716 // Currently we make a somewhat optimistic assumption that
1717     // active_lane_masks are always free. In reality it may be freely folded
1718     // into a tail-predicated loop, expanded into a VCTP or expanded into a lot
1719 // of add/icmp code. We may need to improve this in the future, but being
1720 // able to detect if it is free or not involves looking at a lot of other
1721 // code. We currently assume that the vectorizer inserted these, and knew
1722 // what it was doing in adding one.
1723 if (ST->hasMVEIntegerOps())
1724 return 0;
1725 break;
1726 case Intrinsic::sadd_sat:
1727 case Intrinsic::ssub_sat:
1728 case Intrinsic::uadd_sat:
1729 case Intrinsic::usub_sat: {
1730 if (!ST->hasMVEIntegerOps())
1731 break;
1732 Type *VT = ICA.getReturnType();
1733
1734 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1735 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1736 LT.second == MVT::v16i8) {
1737       // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
1738 // need to extend the type, as it uses shr(qadd(shl, shl)).
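      // (Illustrative) e.g. a legal <16 x i8> saturating add is a single
      // VQADD, whereas a type that must be promoted (such as <4 x i8> widened
      // to <4 x i32>) needs the full shl/shl/vqadd/shr sequence.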
1739 unsigned Instrs =
1740 LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1741 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1742 }
1743 break;
1744 }
1745 case Intrinsic::abs:
1746 case Intrinsic::smin:
1747 case Intrinsic::smax:
1748 case Intrinsic::umin:
1749 case Intrinsic::umax: {
1750 if (!ST->hasMVEIntegerOps())
1751 break;
1752 Type *VT = ICA.getReturnType();
1753
1754 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1755 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1756 LT.second == MVT::v16i8)
1757 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1758 break;
1759 }
1760 case Intrinsic::minnum:
1761 case Intrinsic::maxnum: {
1762 if (!ST->hasMVEFloatOps())
1763 break;
1764 Type *VT = ICA.getReturnType();
1765 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1766 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1767 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1768 break;
1769 }
1770 case Intrinsic::fptosi_sat:
1771 case Intrinsic::fptoui_sat: {
1772 if (ICA.getArgTypes().empty())
1773 break;
1774 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1775 auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]);
1776 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
1777     // Check for the legal types, with the correct subtarget features.
1778 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
1779 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
1780 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
1781 return LT.first;
1782
1783 // Equally for MVE vector types
1784 if (ST->hasMVEFloatOps() &&
1785 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
1786 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
1787 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1788
1789 // Otherwise we use a legal convert followed by a min+max
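    // (Illustrative) e.g. llvm.fptosi.sat.i16.f32 with only VFP: one convert
    // to i32 followed by an smin/smax clamp against the i16 bounds.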
1790 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
1791 (ST->hasFP64() && LT.second == MVT::f64) ||
1792 (ST->hasFullFP16() && LT.second == MVT::f16) ||
1793 (ST->hasMVEFloatOps() &&
1794 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
1795 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
1796 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
1797 LT.second.getScalarSizeInBits());
1798 InstructionCost Cost =
1799 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1800 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
1801 : Intrinsic::umin,
1802 LegalTy, {LegalTy, LegalTy});
1803 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
1804 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
1805 : Intrinsic::umax,
1806 LegalTy, {LegalTy, LegalTy});
1807 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
1808 return LT.first * Cost;
1809 }
1810 break;
1811 }
1812 }
1813
1814 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1815 }
1816
1817 bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1818 if (!F->isIntrinsic())
1819 return BaseT::isLoweredToCall(F);
1820
1821 // Assume all Arm-specific intrinsics map to an instruction.
1822 if (F->getName().startswith("llvm.arm"))
1823 return false;
1824
1825 switch (F->getIntrinsicID()) {
1826 default: break;
1827 case Intrinsic::powi:
1828 case Intrinsic::sin:
1829 case Intrinsic::cos:
1830 case Intrinsic::pow:
1831 case Intrinsic::log:
1832 case Intrinsic::log10:
1833 case Intrinsic::log2:
1834 case Intrinsic::exp:
1835 case Intrinsic::exp2:
1836 return true;
1837 case Intrinsic::sqrt:
1838 case Intrinsic::fabs:
1839 case Intrinsic::copysign:
1840 case Intrinsic::floor:
1841 case Intrinsic::ceil:
1842 case Intrinsic::trunc:
1843 case Intrinsic::rint:
1844 case Intrinsic::nearbyint:
1845 case Intrinsic::round:
1846 case Intrinsic::canonicalize:
1847 case Intrinsic::lround:
1848 case Intrinsic::llround:
1849 case Intrinsic::lrint:
1850 case Intrinsic::llrint:
1851 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1852 return true;
1853 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1854 return true;
1855     // Some operations can be handled by vector instructions; assume that
1856     // unsupported vectors will be expanded into supported scalar ones.
1857     // TODO: Handle scalar operations properly.
1858 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1859 case Intrinsic::masked_store:
1860 case Intrinsic::masked_load:
1861 case Intrinsic::masked_gather:
1862 case Intrinsic::masked_scatter:
1863 return !ST->hasMVEIntegerOps();
1864 case Intrinsic::sadd_with_overflow:
1865 case Intrinsic::uadd_with_overflow:
1866 case Intrinsic::ssub_with_overflow:
1867 case Intrinsic::usub_with_overflow:
1868 case Intrinsic::sadd_sat:
1869 case Intrinsic::uadd_sat:
1870 case Intrinsic::ssub_sat:
1871 case Intrinsic::usub_sat:
1872 return false;
1873 }
1874
1875 return BaseT::isLoweredToCall(F);
1876 }
1877
1878 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1879 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1880 EVT VT = TLI->getValueType(DL, I.getType(), true);
1881 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1882 return true;
1883
1884 // Check if an intrinsic will be lowered to a call and assume that any
1885 // other CallInst will generate a bl.
1886 if (auto *Call = dyn_cast<CallInst>(&I)) {
1887 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1888       switch (II->getIntrinsicID()) {
1889 case Intrinsic::memcpy:
1890 case Intrinsic::memset:
1891 case Intrinsic::memmove:
1892 return getNumMemOps(II) == -1;
1893 default:
1894 if (const Function *F = Call->getCalledFunction())
1895 return isLoweredToCall(F);
1896 }
1897 }
1898 return true;
1899 }
1900
1901 // FPv5 provides conversions between integer, double-precision,
1902 // single-precision, and half-precision formats.
1903 switch (I.getOpcode()) {
1904 default:
1905 break;
1906 case Instruction::FPToSI:
1907 case Instruction::FPToUI:
1908 case Instruction::SIToFP:
1909 case Instruction::UIToFP:
1910 case Instruction::FPTrunc:
1911 case Instruction::FPExt:
1912 return !ST->hasFPARMv8Base();
1913 }
1914
1915 // FIXME: Unfortunately the approach of checking the Operation Action does
1916 // not catch all cases of Legalization that use library calls. Our
1917 // Legalization step categorizes some transformations into library calls as
1918 // Custom, Expand or even Legal when doing type legalization. So for now
1919   // we have to special-case, for instance, the SDIV of 64-bit integers and
1920   // the use of floating point emulation.
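  // For example, a 64-bit sdiv/udiv is typically lowered to a call to the
  // __aeabi_ldivmod / __aeabi_uldivmod runtime helpers.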
1921 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1922 switch (ISD) {
1923 default:
1924 break;
1925 case ISD::SDIV:
1926 case ISD::UDIV:
1927 case ISD::SREM:
1928 case ISD::UREM:
1929 case ISD::SDIVREM:
1930 case ISD::UDIVREM:
1931 return true;
1932 }
1933 }
1934
1935 // Assume all other non-float operations are supported.
1936 if (!VT.isFloatingPoint())
1937 return false;
1938
1939 // We'll need a library call to handle most floats when using soft.
1940 if (TLI->useSoftFloat()) {
1941 switch (I.getOpcode()) {
1942 default:
1943 return true;
1944 case Instruction::Alloca:
1945 case Instruction::Load:
1946 case Instruction::Store:
1947 case Instruction::Select:
1948 case Instruction::PHI:
1949 return false;
1950 }
1951 }
1952
1953 // We'll need a libcall to perform double precision operations on a single
1954 // precision only FPU.
1955 if (I.getType()->isDoubleTy() && !ST->hasFP64())
1956 return true;
1957
1958 // Likewise for half precision arithmetic.
1959 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1960 return true;
1961
1962 return false;
1963 }
1964
1965 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1966 AssumptionCache &AC,
1967 TargetLibraryInfo *LibInfo,
1968 HardwareLoopInfo &HWLoopInfo) {
1969 // Low-overhead branches are only supported in the 'low-overhead branch'
1970 // extension of v8.1-m.
1971 if (!ST->hasLOB() || DisableLowOverheadLoops) {
1972 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
1973 return false;
1974 }
1975
1976 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1977 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
1978 return false;
1979 }
1980
1981 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1982 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1983 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
1984 return false;
1985 }
1986
1987 const SCEV *TripCountSCEV =
1988 SE.getAddExpr(BackedgeTakenCount,
1989 SE.getOne(BackedgeTakenCount->getType()));
1990
1991 // We need to store the trip count in LR, a 32-bit register.
1992 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1993 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
1994 return false;
1995 }
1996
1997 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1998 // point in generating a hardware loop if that's going to happen.
1999
2000 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2001 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2002 switch (Call->getIntrinsicID()) {
2003 default:
2004 break;
2005 case Intrinsic::start_loop_iterations:
2006 case Intrinsic::test_start_loop_iterations:
2007 case Intrinsic::loop_decrement:
2008 case Intrinsic::loop_decrement_reg:
2009 return true;
2010 }
2011 }
2012 return false;
2013 };
2014
2015 // Scan the instructions to see if there's any that we know will turn into a
2016 // call or if this loop is already a low-overhead loop or will become a tail
2017 // predicated loop.
2018 bool IsTailPredLoop = false;
2019 auto ScanLoop = [&](Loop *L) {
2020 for (auto *BB : L->getBlocks()) {
2021 for (auto &I : *BB) {
2022 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2023 isa<InlineAsm>(I)) {
2024 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2025 return false;
2026 }
2027 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2028 IsTailPredLoop |=
2029 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2030 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2031 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2032 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2033 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2034 }
2035 }
2036 return true;
2037 };
2038
2039 // Visit inner loops.
2040 for (auto Inner : *L)
2041 if (!ScanLoop(Inner))
2042 return false;
2043
2044 if (!ScanLoop(L))
2045 return false;
2046
2047 // TODO: Check whether the trip count calculation is expensive. If L is the
2048 // inner loop but we know it has a low trip count, calculating that trip
2049 // count (in the parent loop) may be detrimental.
2050
2051 LLVMContext &C = L->getHeader()->getContext();
2052 HWLoopInfo.CounterInReg = true;
2053 HWLoopInfo.IsNestingLegal = false;
2054 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2055 HWLoopInfo.CountType = Type::getInt32Ty(C);
2056 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2057 return true;
2058 }
2059
2060 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2061   // We don't allow icmps, and because we only look at single-block loops, we
2062   // simply count the icmps, i.e. there should only be 1 for the backedge.
2063 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2064 return false;
2065 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2066 // not currently canonical, but soon will be. Code without them uses icmp, and
2067 // so is not tail predicated as per the condition above. In order to get the
2068 // same performance we treat min and max the same as an icmp for tailpred
2069 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2070   // pick more optimal instructions like VQDMULH. They need to be recognized
2071 // directly by the vectorizer).
2072 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2073 if ((II->getIntrinsicID() == Intrinsic::smin ||
2074 II->getIntrinsicID() == Intrinsic::smax ||
2075 II->getIntrinsicID() == Intrinsic::umin ||
2076 II->getIntrinsicID() == Intrinsic::umax) &&
2077 ++ICmpCount > 1)
2078 return false;
2079
2080 if (isa<FCmpInst>(&I))
2081 return false;
2082
2083 // We could allow extending/narrowing FP loads/stores, but codegen is
2084 // too inefficient so reject this for now.
2085 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2086 return false;
2087
2088 // Extends have to be extending-loads
2089   if (isa<SExtInst>(&I) || isa<ZExtInst>(&I))
2090 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2091 return false;
2092
2093 // Truncs have to be narrowing-stores
2094   if (isa<TruncInst>(&I))
2095 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2096 return false;
2097
2098 return true;
2099 }
2100
2101 // To set up a tail-predicated loop, we need to know the total number of
2102 // elements processed by that loop. Thus, we need to determine the element
2103 // size and:
2104 // 1) it should be uniform for all operations in the vector loop, so we
2105 // e.g. don't want any widening/narrowing operations.
2106 // 2) it should be smaller than i64s because we don't have vector operations
2107 // that work on i64s.
2108 // 3) we don't want elements to be reversed or shuffled, to make sure the
2109 // tail-predication masks/predicates the right lanes.
2110 //
2111 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2112 const DataLayout &DL,
2113 const LoopAccessInfo *LAI) {
2114 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2115
2116 // If there are live-out values, it is probably a reduction. We can predicate
2117 // most reduction operations freely under MVE using a combination of
2118 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2119 // floating point and integer reductions, but don't check for operators
2120 // specifically here. If the value ends up not being a reduction (and so the
2121 // vectorizer cannot tailfold the loop), we should fall back to standard
2122 // vectorization automatically.
2123   SmallVector<Instruction *, 8> LiveOuts;
2124 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2125 bool ReductionsDisabled =
2126 EnableTailPredication == TailPredication::EnabledNoReductions ||
2127 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2128
2129 for (auto *I : LiveOuts) {
2130 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2131 !I->getType()->isHalfTy()) {
2132 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2133 "live-out value\n");
2134 return false;
2135 }
2136 if (ReductionsDisabled) {
2137 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2138 return false;
2139 }
2140 }
2141
2142 // Next, check that all instructions can be tail-predicated.
2143 PredicatedScalarEvolution PSE = LAI->getPSE();
2144 SmallVector<Instruction *, 16> LoadStores;
2145 int ICmpCount = 0;
2146
2147 for (BasicBlock *BB : L->blocks()) {
2148 for (Instruction &I : BB->instructionsWithoutDebug()) {
2149 if (isa<PHINode>(&I))
2150 continue;
2151 if (!canTailPredicateInstruction(I, ICmpCount)) {
2152 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2153 return false;
2154 }
2155
2156 Type *T = I.getType();
2157 if (T->getScalarSizeInBits() > 32) {
2158 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2159 return false;
2160 }
2161 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2162 Value *Ptr = getLoadStorePointerOperand(&I);
2163 Type *AccessTy = getLoadStoreType(&I);
2164 int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L);
2165 if (NextStride == 1) {
2166 // TODO: for now only allow consecutive strides of 1. We could support
2167 // other strides as long as it is uniform, but let's keep it simple
2168 // for now.
2169 continue;
2170 } else if (NextStride == -1 ||
2171 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2172 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2173 LLVM_DEBUG(dbgs()
2174                    << "Consecutive strides of 2 found, vld2/vst2 can't "
2175                       "be tail-predicated.\n");
2176 return false;
2177 // TODO: don't tail predicate if there is a reversed load?
2178 } else if (EnableMaskedGatherScatters) {
2179 // Gather/scatters do allow loading from arbitrary strides, at
2180 // least if they are loop invariant.
2181 // TODO: Loop variant strides should in theory work, too, but
2182 // this requires further testing.
2183 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2184 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2185 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2186 if (PSE.getSE()->isLoopInvariant(Step, L))
2187 continue;
2188 }
2189 }
2190         LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2191                              "tail-predicate.\n");
2192 return false;
2193 }
2194 }
2195 }
2196
2197 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2198 return true;
2199 }
2200
2201 bool ARMTTIImpl::preferPredicateOverEpilogue(
2202 Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
2203 TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) {
2204 if (!EnableTailPredication) {
2205 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2206 return false;
2207 }
2208
2209 // Creating a predicated vector loop is the first step for generating a
2210 // tail-predicated hardware loop, for which we need the MVE masked
2211 // load/stores instructions:
2212 if (!ST->hasMVEIntegerOps())
2213 return false;
2214
2215 // For now, restrict this to single block loops.
2216 if (L->getNumBlocks() > 1) {
2217 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2218 "loop.\n");
2219 return false;
2220 }
2221
2222 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2223
2224 HardwareLoopInfo HWLoopInfo(L);
2225 if (!HWLoopInfo.canAnalyze(*LI)) {
2226 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2227 "analyzable.\n");
2228 return false;
2229 }
2230
2231 // This checks if we have the low-overhead branch architecture
2232 // extension, and if we will create a hardware-loop:
2233 if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2234 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2235 "profitable.\n");
2236 return false;
2237 }
2238
2239 if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2240 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2241 "a candidate.\n");
2242 return false;
2243 }
2244
2245 return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI());
2246 }
2247
2248 PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const {
2249 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2250 return PredicationStyle::None;
2251
2252 // Intrinsic @llvm.get.active.lane.mask is supported.
2253 // It is used in the MVETailPredication pass, which requires the number of
2254 // elements processed by this vector loop to setup the tail-predicated
2255 // loop.
2256 return PredicationStyle::Data;
2257 }
2258 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2259 TTI::UnrollingPreferences &UP,
2260 OptimizationRemarkEmitter *ORE) {
2261   // Enable UpperBound unrolling universally, not dependent upon the conditions
2262   // below.
2263 UP.UpperBound = true;
2264
2265 // Only currently enable these preferences for M-Class cores.
2266 if (!ST->isMClass())
2267 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2268
2269 // Disable loop unrolling for Oz and Os.
2270 UP.OptSizeThreshold = 0;
2271 UP.PartialOptSizeThreshold = 0;
2272 if (L->getHeader()->getParent()->hasOptSize())
2273 return;
2274
2275 SmallVector<BasicBlock*, 4> ExitingBlocks;
2276 L->getExitingBlocks(ExitingBlocks);
2277 LLVM_DEBUG(dbgs() << "Loop has:\n"
2278 << "Blocks: " << L->getNumBlocks() << "\n"
2279 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2280
2281   // Only allow one exiting block other than the latch. This acts as an early
2282   // exit, as it mirrors the profitability calculation of the runtime unroller.
2283 if (ExitingBlocks.size() > 2)
2284 return;
2285
2286 // Limit the CFG of the loop body for targets with a branch predictor.
2287 // Allowing 4 blocks permits if-then-else diamonds in the body.
2288 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2289 return;
2290
2291 // Don't unroll vectorized loops, including the remainder loop
2292 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2293 return;
2294
2295 // Scan the loop: don't unroll loops with calls as this could prevent
2296 // inlining.
2297 InstructionCost Cost = 0;
2298 for (auto *BB : L->getBlocks()) {
2299 for (auto &I : *BB) {
2300       // Don't unroll vectorised loops. MVE does not benefit from it as much as
2301 // scalar code.
2302 if (I.getType()->isVectorTy())
2303 return;
2304
2305 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2306 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2307 if (!isLoweredToCall(F))
2308 continue;
2309 }
2310 return;
2311 }
2312
2313 SmallVector<const Value*, 4> Operands(I.operand_values());
2314 Cost +=
2315 getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
2316 }
2317 }
2318
2319 // On v6m cores, there are very few registers available. We can easily end up
2320 // spilling and reloading more registers in an unrolled loop. Look at the
2321 // number of LCSSA phis as a rough measure of how many registers will need to
2322 // be live out of the loop, reducing the default unroll count if more than 1
2323 // value is needed. In the long run, all of this should be being learnt by a
2324 // machine.
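  // (Illustrative) with the default count of 4, two live-out values reduce
  // the runtime unroll count to 4 / 2 = 2; three or more drop it to 1 and
  // unrolling is skipped.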
2325 unsigned UnrollCount = 4;
2326 if (ST->isThumb1Only()) {
2327 unsigned ExitingValues = 0;
2328 SmallVector<BasicBlock *, 4> ExitBlocks;
2329 L->getExitBlocks(ExitBlocks);
2330 for (auto *Exit : ExitBlocks) {
2331 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2332 // only the last is expected to be needed for address operands.
2333 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2334 return PH.getNumOperands() != 1 ||
2335 !isa<GetElementPtrInst>(PH.getOperand(0));
2336 });
2337 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2338 }
2339 if (ExitingValues)
2340 UnrollCount /= ExitingValues;
2341 if (UnrollCount <= 1)
2342 return;
2343 }
2344
2345 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2346 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2347
2348 UP.Partial = true;
2349 UP.Runtime = true;
2350 UP.UnrollRemainder = true;
2351 UP.DefaultUnrollRuntimeCount = UnrollCount;
2352 UP.UnrollAndJam = true;
2353 UP.UnrollAndJamInnerLoopThreshold = 60;
2354
2355   // Force-unrolling small loops can be very useful because of the branch-taken
2356   // cost of the backedge.
2357 if (Cost < 12)
2358 UP.Force = true;
2359 }
2360
2361 void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2362 TTI::PeelingPreferences &PP) {
2363 BaseT::getPeelingPreferences(L, SE, PP);
2364 }
2365
2366 bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2367 TTI::ReductionFlags Flags) const {
2368 if (!ST->hasMVEIntegerOps())
2369 return false;
2370
2371 unsigned ScalarBits = Ty->getScalarSizeInBits();
2372 switch (Opcode) {
2373 case Instruction::Add:
2374 return ScalarBits <= 64;
2375 default:
2376 return false;
2377 }
2378 }
2379
2380 bool ARMTTIImpl::preferPredicatedReductionSelect(
2381 unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2382   return ST->hasMVEIntegerOps();
2385 }
2386