1 //===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "RISCVTargetTransformInfo.h"
10 #include "MCTargetDesc/RISCVMatInt.h"
11 #include "llvm/ADT/STLExtras.h"
12 #include "llvm/Analysis/TargetTransformInfo.h"
13 #include "llvm/CodeGen/BasicTTIImpl.h"
14 #include "llvm/CodeGen/CostTable.h"
15 #include "llvm/CodeGen/TargetLowering.h"
16 #include "llvm/IR/Instructions.h"
17 #include <cmath>
18 #include <optional>
19 using namespace llvm;
20
21 #define DEBUG_TYPE "riscvtti"
22
23 static cl::opt<unsigned> RVVRegisterWidthLMUL(
24 "riscv-v-register-bit-width-lmul",
25 cl::desc(
26 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
27 "by autovectorized code. Fractional LMULs are not supported."),
28 cl::init(2), cl::Hidden);
29
30 static cl::opt<unsigned> SLPMaxVF(
31 "riscv-v-slp-max-vf",
32 cl::desc(
33 "Overrides result used for getMaximumVF query which is used "
34 "exclusively by SLP vectorizer."),
35 cl::Hidden);
36
37 InstructionCost
38 RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
39 TTI::TargetCostKind CostKind) {
40 // Check if the type is valid for all CostKind
41 if (!VT.isVector())
42 return InstructionCost::getInvalid();
43 size_t NumInstr = OpCodes.size();
44 if (CostKind == TTI::TCK_CodeSize)
45 return NumInstr;
46 InstructionCost LMULCost = TLI->getLMULCost(VT);
47 if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
48 return LMULCost * NumInstr;
49 InstructionCost Cost = 0;
50 for (auto Op : OpCodes) {
51 switch (Op) {
52 case RISCV::VRGATHER_VI:
53 Cost += TLI->getVRGatherVICost(VT);
54 break;
55 case RISCV::VRGATHER_VV:
56 Cost += TLI->getVRGatherVVCost(VT);
57 break;
58 case RISCV::VSLIDEUP_VI:
59 case RISCV::VSLIDEDOWN_VI:
60 Cost += TLI->getVSlideVICost(VT);
61 break;
62 case RISCV::VSLIDEUP_VX:
63 case RISCV::VSLIDEDOWN_VX:
64 Cost += TLI->getVSlideVXCost(VT);
65 break;
66 case RISCV::VREDMAX_VS:
67 case RISCV::VREDMIN_VS:
68 case RISCV::VREDMAXU_VS:
69 case RISCV::VREDMINU_VS:
70 case RISCV::VREDSUM_VS:
71 case RISCV::VREDAND_VS:
72 case RISCV::VREDOR_VS:
73 case RISCV::VREDXOR_VS:
74 case RISCV::VFREDMAX_VS:
75 case RISCV::VFREDMIN_VS:
76 case RISCV::VFREDUSUM_VS: {
77 unsigned VL = VT.getVectorMinNumElements();
78 if (!VT.isFixedLengthVector())
79 VL *= *getVScaleForTuning();
80 Cost += Log2_32_Ceil(VL);
81 break;
82 }
83 case RISCV::VFREDOSUM_VS: {
84 unsigned VL = VT.getVectorMinNumElements();
85 if (!VT.isFixedLengthVector())
86 VL *= *getVScaleForTuning();
87 Cost += VL;
88 break;
89 }
90 case RISCV::VMV_X_S:
91 case RISCV::VMV_S_X:
92 Cost += 1;
93 break;
94 default:
95 Cost += LMULCost;
96 }
97 }
98 return Cost;
99 }
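
// Worked example (illustrative, not from the source; assumes a vscale-for-tuning
// of 2): costing {VREDSUM_VS, VMV_X_S} for nxv4i32 at TCK_RecipThroughput gives
// VL = 4 * 2 = 8, so the reduction term is Log2_32_Ceil(8) = 3 and the vmv.x.s
// term is 1, for a total of 4. At TCK_CodeSize the same query simply returns the
// opcode count, 2.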
100
101 InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
102 TTI::TargetCostKind CostKind) {
103 assert(Ty->isIntegerTy() &&
104 "getIntImmCost can only estimate cost of materialising integers");
105
106 // We have a Zero register, so 0 is always free.
107 if (Imm == 0)
108 return TTI::TCC_Free;
109
110 // Otherwise, we check how many instructions it will take to materialise.
111 const DataLayout &DL = getDataLayout();
112 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST());
113 }
114
115 // Look for patterns of shift followed by AND that can be turned into a pair of
116 // shifts. We won't need to materialize an immediate for the AND so these can
117 // be considered free.
118 static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
119 uint64_t Mask = Imm.getZExtValue();
120 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
121 if (!BO || !BO->hasOneUse())
122 return false;
123
124 if (BO->getOpcode() != Instruction::Shl)
125 return false;
126
127 if (!isa<ConstantInt>(BO->getOperand(1)))
128 return false;
129
130 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
131 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
132 // is a mask shifted by c2 bits with c3 leading zeros.
133 if (isShiftedMask_64(Mask)) {
134 unsigned Trailing = llvm::countr_zero(Mask);
135 if (ShAmt == Trailing)
136 return true;
137 }
138
139 return false;
140 }
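
// Illustrative instance of the pattern above (hypothetical values):
//   %s = shl i64 %x, 4
//   %a = and i64 %s, 4080      ; 0xFF0, a shifted mask with 4 trailing zeros
// Here the shift amount equals countr_zero(0xFF0), so the AND immediate never
// needs to be materialized; the backend can emit an slli+srli pair instead.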
141
142 InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
143 const APInt &Imm, Type *Ty,
144 TTI::TargetCostKind CostKind,
145 Instruction *Inst) {
146 assert(Ty->isIntegerTy() &&
147 "getIntImmCost can only estimate cost of materialising integers");
148
149 // We have a Zero register, so 0 is always free.
150 if (Imm == 0)
151 return TTI::TCC_Free;
152
153 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
154 // commutative, in others the immediate comes from a specific argument index.
155 bool Takes12BitImm = false;
156 unsigned ImmArgIdx = ~0U;
157
158 switch (Opcode) {
159 case Instruction::GetElementPtr:
160 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
161 // split up large offsets in GEP into better parts than ConstantHoisting
162 // can.
163 return TTI::TCC_Free;
164 case Instruction::And:
165 // zext.h
166 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
167 return TTI::TCC_Free;
168 // zext.w
169 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
170 return TTI::TCC_Free;
171 // bclri
172 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
173 return TTI::TCC_Free;
174 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
175 canUseShiftPair(Inst, Imm))
176 return TTI::TCC_Free;
177 Takes12BitImm = true;
178 break;
179 case Instruction::Add:
180 Takes12BitImm = true;
181 break;
182 case Instruction::Or:
183 case Instruction::Xor:
184 // bseti/binvi
185 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
186 return TTI::TCC_Free;
187 Takes12BitImm = true;
188 break;
189 case Instruction::Mul:
190 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
191 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
192 return TTI::TCC_Free;
193 // One more or less than a power of 2 can use SLLI+ADD/SUB.
194 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
195 return TTI::TCC_Free;
196 // FIXME: There is no MULI instruction.
197 Takes12BitImm = true;
198 break;
199 case Instruction::Sub:
200 case Instruction::Shl:
201 case Instruction::LShr:
202 case Instruction::AShr:
203 Takes12BitImm = true;
204 ImmArgIdx = 1;
205 break;
206 default:
207 break;
208 }
209
210 if (Takes12BitImm) {
211 // Check immediate is the correct argument...
212 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
213 // ... and fits into the 12-bit immediate.
214 if (Imm.getSignificantBits() <= 64 &&
215 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
216 return TTI::TCC_Free;
217 }
218 }
219
220 // Otherwise, use the full materialisation cost.
221 return getIntImmCost(Imm, Ty, CostKind);
222 }
223
224 // By default, prevent hoisting.
225 return TTI::TCC_Free;
226 }
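
// Worked examples (illustrative): with Zba, `and %x, 0xffffffff` is free because
// it lowers to zext.w; `add %x, 2047` is free since 2047 fits the signed 12-bit
// addi immediate, while `add %x, 2048` does not and is charged the full
// materialization cost via getIntImmCost.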
227
228 InstructionCost
229 RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
230 const APInt &Imm, Type *Ty,
231 TTI::TargetCostKind CostKind) {
232 // Prevent hoisting in unknown cases.
233 return TTI::TCC_Free;
234 }
235
236 TargetTransformInfo::PopcntSupportKind
237 RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
238 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
239 return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
240 ? TTI::PSK_FastHardware
241 : TTI::PSK_Software;
242 }
243
244 bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
245 // Currently, the ExpandReductions pass can't expand scalable-vector
246 // reductions, but we still request expansion as RVV doesn't support certain
247 // reductions and the SelectionDAG can't legalize them either.
248 switch (II->getIntrinsicID()) {
249 default:
250 return false;
251 // These reductions have no equivalent in RVV
252 case Intrinsic::vector_reduce_mul:
253 case Intrinsic::vector_reduce_fmul:
254 return true;
255 }
256 }
257
258 std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
259 if (ST->hasVInstructions())
260 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
261 return BaseT::getMaxVScale();
262 }
263
264 std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
265 if (ST->hasVInstructions())
266 if (unsigned MinVLen = ST->getRealMinVLen();
267 MinVLen >= RISCV::RVVBitsPerBlock)
268 return MinVLen / RISCV::RVVBitsPerBlock;
269 return BaseT::getVScaleForTuning();
270 }
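
// For example (illustrative), on a subtarget with Zvl256b the real minimum VLEN
// is 256, so getVScaleForTuning returns 256 / RVVBitsPerBlock = 256 / 64 = 4;
// getMaxVScale returns the same value if the maximum VLEN is also capped at 256.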
271
272 TypeSize
273 RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
274 unsigned LMUL =
275 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
276 switch (K) {
277 case TargetTransformInfo::RGK_Scalar:
278 return TypeSize::getFixed(ST->getXLen());
279 case TargetTransformInfo::RGK_FixedWidthVector:
280 return TypeSize::getFixed(
281 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
282 case TargetTransformInfo::RGK_ScalableVector:
283 return TypeSize::getScalable(
284 (ST->hasVInstructions() &&
285 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
286 ? LMUL * RISCV::RVVBitsPerBlock
287 : 0);
288 }
289
290 llvm_unreachable("Unsupported register kind");
291 }
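
// Worked example (illustrative): with the default riscv-v-register-bit-width-lmul
// of 2 and a subtarget guaranteeing VLEN >= 128, the fixed-width answer is
// 2 * 128 = 256 bits (when RVV is used for fixed-length vectors) and the scalable
// answer is 2 * RVVBitsPerBlock = 128 bits, scaled by vscale at runtime.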
292
293 InstructionCost
294 RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
295 // Add a cost of address generation + the cost of the load. The address
296 // is expected to be a PC relative offset to a constant pool entry
297 // using auipc/addi.
298 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
299 /*AddressSpace=*/0, CostKind);
300 }
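
// The expected lowering for a vector constant looks roughly like this
// (illustrative only; label names are made up, medany code model assumed):
//   .Lpcrel_hi0:
//     auipc a0, %pcrel_hi(.LCPI0_0)
//     addi  a0, a0, %pcrel_lo(.Lpcrel_hi0)
//     vle32.v v8, (a0)
// which matches the "2 + load" cost returned above.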
301
302 static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
303 LLVMContext &C) {
304 assert((DataVT.getScalarSizeInBits() != 8 ||
305 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
306 MVT IndexVT = DataVT.changeTypeToInteger();
307 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
308 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
309 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
310 }
311
312 InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
313 VectorType *Tp, ArrayRef<int> Mask,
314 TTI::TargetCostKind CostKind,
315 int Index, VectorType *SubTp,
316 ArrayRef<const Value *> Args) {
317 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
318
319 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
320
321 // First, handle cases where having a fixed length vector enables us to
322 // give a more accurate cost than falling back to generic scalable codegen.
323 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
324 if (isa<FixedVectorType>(Tp)) {
325 switch (Kind) {
326 default:
327 break;
328 case TTI::SK_PermuteSingleSrc: {
329 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
330 MVT EltTp = LT.second.getVectorElementType();
331 // If the size of the element is < ELEN then shuffles of interleaves and
332 // deinterleaves of 2 vectors can be lowered into the following
333 // sequences
334 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
335 // Example sequence:
336 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
337 // vwaddu.vv v10, v8, v9
338 // li a0, -1 (ignored)
339 // vwmaccu.vx v10, a0, v9
340 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
341 return 2 * LT.first * TLI->getLMULCost(LT.second);
342
343 if (Mask[0] == 0 || Mask[0] == 1) {
344 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
345 // Example sequence:
346 // vnsrl.wi v10, v8, 0
347 if (equal(DeinterleaveMask, Mask))
348 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
349 LT.second, CostKind);
350 }
351 }
352 }
353 // vrgather + cost of generating the mask constant.
354 // We model this for an unknown mask with a single vrgather.
355 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
356 (LT.second.getScalarSizeInBits() != 8 ||
357 LT.second.getVectorNumElements() <= 256)) {
358 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
359 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
360 return IndexCost +
361 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
362 }
363 [[fallthrough]];
364 }
365 case TTI::SK_Transpose:
366 case TTI::SK_PermuteTwoSrc: {
367 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
368 // register for the second vrgather. We model this for an unknown
369 // (shuffle) mask.
370 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
371 (LT.second.getScalarSizeInBits() != 8 ||
372 LT.second.getVectorNumElements() <= 256)) {
373 auto &C = Tp->getContext();
374 auto EC = Tp->getElementCount();
375 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
376 VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
377 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
378 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
379 return 2 * IndexCost +
380 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
381 LT.second, CostKind) +
382 MaskCost;
383 }
384 [[fallthrough]];
385 }
386 case TTI::SK_Select: {
387 // We are going to permute multiple sources and the result will be in
388 // multiple destinations. Providing an accurate cost only for splits where
389 // the element type remains the same.
390 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
391 LT.second.isFixedLengthVector() &&
392 LT.second.getVectorElementType().getSizeInBits() ==
393 Tp->getElementType()->getPrimitiveSizeInBits() &&
394 LT.second.getVectorNumElements() <
395 cast<FixedVectorType>(Tp)->getNumElements() &&
396 divideCeil(Mask.size(),
397 cast<FixedVectorType>(Tp)->getNumElements()) ==
398 static_cast<unsigned>(*LT.first.getValue())) {
399 unsigned NumRegs = *LT.first.getValue();
400 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
401 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
402 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
403
404 InstructionCost Cost = 0;
405 for (unsigned I = 0; I < NumRegs; ++I) {
406 bool IsSingleVector = true;
407 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
408 transform(Mask.slice(I * SubVF,
409 I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
410 SubMask.begin(), [&](int I) {
411 bool SingleSubVector = I / VF == 0;
412 IsSingleVector &= SingleSubVector;
413 return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
414 });
415 Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
416 : TTI::SK_PermuteTwoSrc,
417 SubVecTy, SubMask, CostKind, 0, nullptr);
418 }
419 return Cost;
420 }
421 break;
422 }
423 }
424 }
425
426 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
427 switch (Kind) {
428 default:
429 // Fallthrough to generic handling.
430 // TODO: Most of these cases will return getInvalid in generic code, and
431 // must be implemented here.
432 break;
433 case TTI::SK_ExtractSubvector:
434 // Example sequence:
435 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
436 // vslidedown.vi v8, v9, 2
437 return LT.first *
438 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
439 case TTI::SK_InsertSubvector:
440 // Example sequence:
441 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
442 // vslideup.vi v8, v9, 2
443 return LT.first *
444 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
445 case TTI::SK_Select: {
446 // Example sequence:
447 // li a0, 90
448 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
449 // vmv.s.x v0, a0
450 // vmerge.vvm v8, v9, v8, v0
451 // We use 2 for the cost of the mask materialization as this is the true
452 // cost for small masks and most shuffles are small. At worst, this cost
453 // should be a very small constant for the constant pool load. As such,
454 // we may bias towards large selects slightly more than truly warranted.
455 return LT.first *
456 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
457 LT.second, CostKind));
458 }
459 case TTI::SK_Broadcast: {
460 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
461 Instruction::InsertElement);
462 if (LT.second.getScalarSizeInBits() == 1) {
463 if (HasScalar) {
464 // Example sequence:
465 // andi a0, a0, 1
466 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
467 // vmv.v.x v8, a0
468 // vmsne.vi v0, v8, 0
469 return LT.first *
470 (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for andi
471 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
472 LT.second, CostKind));
473 }
474 // Example sequence:
475 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
476 // vmv.v.i v8, 0
477 // vmerge.vim v8, v8, 1, v0
478 // vmv.x.s a0, v8
479 // andi a0, a0, 1
480 // vmv.v.x v8, a0
481 // vmsne.vi v0, v8, 0
482
483 return LT.first *
484 (TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi
485 getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
486 RISCV::VMV_X_S, RISCV::VMV_V_X,
487 RISCV::VMSNE_VI},
488 LT.second, CostKind));
489 }
490
491 if (HasScalar) {
492 // Example sequence:
493 // vmv.v.x v8, a0
494 return LT.first *
495 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
496 }
497
498 // Example sequence:
499 // vrgather.vi v9, v8, 0
500 return LT.first *
501 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
502 }
503 case TTI::SK_Splice: {
504 // vslidedown+vslideup.
505 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
506 // of similar code, but I think we expand through memory.
507 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
508 if (Index >= 0 && Index < 32)
509 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
510 else if (Index < 0 && Index > -32)
511 Opcodes[1] = RISCV::VSLIDEUP_VI;
512 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
513 }
514 case TTI::SK_Reverse: {
515 // TODO: Cases to improve here:
516 // * Illegal vector types
517 // * i64 on RV32
518 // * i1 vector
519 // At low LMUL, most of the cost is producing the vrgather index register.
520 // At high LMUL, the cost of the vrgather itself will dominate.
521 // Example sequence:
522 // csrr a0, vlenb
523 // srli a0, a0, 3
524 // addi a0, a0, -1
525 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
526 // vid.v v9
527 // vrsub.vx v10, v9, a0
528 // vrgather.vv v9, v8, v10
529 InstructionCost LenCost = 3;
530 if (LT.second.isFixedLengthVector())
531 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
532 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
533 // FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX}
534 InstructionCost GatherCost =
535 2 + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
536 // Reversing a mask vector additionally requires an extend and a truncate.
537 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
538 return LT.first * (LenCost + GatherCost + ExtendCost);
539 }
540 }
541 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
542 }
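
// Worked example for SK_Reverse (illustrative): reversing a scalable
// vscale x 8 x i8 vector costs LenCost 3 (csrr+srli+addi) plus GatherCost
// 2 + cost(vrgather.vv); for an i1 element type an extra ExtendCost of 3 is
// added for the mask <-> i8 round trip.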
543
544 InstructionCost
545 RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
546 unsigned AddressSpace,
547 TTI::TargetCostKind CostKind) {
548 if (!isLegalMaskedLoadStore(Src, Alignment) ||
549 CostKind != TTI::TCK_RecipThroughput)
550 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
551 CostKind);
552
553 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
554 }
555
556 InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
557 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
558 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
559 bool UseMaskForCond, bool UseMaskForGaps) {
560 if (isa<ScalableVectorType>(VecTy))
561 return InstructionCost::getInvalid();
562 auto *FVTy = cast<FixedVectorType>(VecTy);
563 InstructionCost MemCost =
564 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
565 unsigned VF = FVTy->getNumElements() / Factor;
566
567 // The interleaved memory access pass will lower interleaved memory ops (i.e.
568 // a load or store followed by a specific shuffle) to vlseg/vsseg
569 // intrinsics. In those cases we can treat it as if it's just one (legal)
570 // memory op
571 if (!UseMaskForCond && !UseMaskForGaps &&
572 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
573 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
574 // Need to make sure the type hasn't been scalarized
575 if (LT.second.isFixedLengthVector()) {
576 auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
577 LT.second.getVectorNumElements());
578 // FIXME: We use the memory op cost of the *legalized* type here, because
579 // getMemoryOpCost returns a really expensive cost for types like
580 // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
581 // Should the memory op cost of these be cheaper?
582 if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
583 AddressSpace, DL)) {
584 InstructionCost LegalMemCost = getMemoryOpCost(
585 Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
586 return LT.first + LegalMemCost;
587 }
588 }
589 }
590
591 // An interleaved load will look like this for Factor=3:
592 // %wide.vec = load <12 x i32>, ptr %3, align 4
593 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
594 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
595 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
596 if (Opcode == Instruction::Load) {
597 InstructionCost Cost = MemCost;
598 for (unsigned Index : Indices) {
599 FixedVectorType *SubVecTy =
600 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
601 auto Mask = createStrideMask(Index, Factor, VF);
602 InstructionCost ShuffleCost =
603 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
604 CostKind, 0, nullptr, {});
605 Cost += ShuffleCost;
606 }
607 return Cost;
608 }
609
610 // TODO: Model for NF > 2
611 // We'll need to enhance getShuffleCost to model shuffles that are just
612 // inserts and extracts into subvectors, since they won't have the full cost
613 // of a vrgather.
614 // An interleaved store for 3 vectors of 4 lanes will look like
615 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
616 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
617 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
618 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
619 // store <12 x i32> %interleaved.vec, ptr %10, align 4
620 if (Factor != 2)
621 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
622 Alignment, AddressSpace, CostKind,
623 UseMaskForCond, UseMaskForGaps);
624
625 assert(Opcode == Instruction::Store && "Opcode must be a store");
626 // For an interleaving store of 2 vectors, we perform one large interleaving
627 // shuffle that goes into the wide store
628 auto Mask = createInterleaveMask(VF, Factor);
629 InstructionCost ShuffleCost =
630 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
631 CostKind, 0, nullptr, {});
632 return MemCost + ShuffleCost;
633 }
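
// Worked example (illustrative): an interleaved store of two <4 x i32> vectors
// that is not recognized as a segment store (e.g. because a mask is required) is
// costed as one 8-lane interleave shuffle (VF = 4, Factor = 2) plus the memory
// cost of the single wide <8 x i32> store computed above.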
634
635 InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
636 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
637 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
638 if (CostKind != TTI::TCK_RecipThroughput)
639 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
640 Alignment, CostKind, I);
641
642 if ((Opcode == Instruction::Load &&
643 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
644 (Opcode == Instruction::Store &&
645 !isLegalMaskedScatter(DataTy, Align(Alignment))))
646 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
647 Alignment, CostKind, I);
648
649 // Cost is proportional to the number of memory operations implied. For
650 // scalable vectors, we use an estimate on that number since we don't
651 // know exactly what VL will be.
652 auto &VTy = *cast<VectorType>(DataTy);
653 InstructionCost MemOpCost =
654 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
655 {TTI::OK_AnyValue, TTI::OP_None}, I);
656 unsigned NumLoads = getEstimatedVLFor(&VTy);
657 return NumLoads * MemOpCost;
658 }
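
// For example (illustrative; assumes a vscale-for-tuning of 2): a legal masked
// gather of nxv4i32 is estimated as VL = 8 element loads, so the returned cost
// is roughly 8 * getMemoryOpCost(Load, i32, ...).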
659
660 // Currently, these represent both throughput and codesize costs
661 // for the respective intrinsics. The costs in this table are simply
662 // instruction counts with the following adjustments made:
663 // * One vsetvli is considered free.
664 static const CostTblEntry VectorIntrinsicCostTable[]{
665 {Intrinsic::floor, MVT::f32, 9},
666 {Intrinsic::floor, MVT::f64, 9},
667 {Intrinsic::ceil, MVT::f32, 9},
668 {Intrinsic::ceil, MVT::f64, 9},
669 {Intrinsic::trunc, MVT::f32, 7},
670 {Intrinsic::trunc, MVT::f64, 7},
671 {Intrinsic::round, MVT::f32, 9},
672 {Intrinsic::round, MVT::f64, 9},
673 {Intrinsic::roundeven, MVT::f32, 9},
674 {Intrinsic::roundeven, MVT::f64, 9},
675 {Intrinsic::rint, MVT::f32, 7},
676 {Intrinsic::rint, MVT::f64, 7},
677 {Intrinsic::lrint, MVT::i32, 1},
678 {Intrinsic::lrint, MVT::i64, 1},
679 {Intrinsic::llrint, MVT::i64, 1},
680 {Intrinsic::nearbyint, MVT::f32, 9},
681 {Intrinsic::nearbyint, MVT::f64, 9},
682 {Intrinsic::bswap, MVT::i16, 3},
683 {Intrinsic::bswap, MVT::i32, 12},
684 {Intrinsic::bswap, MVT::i64, 31},
685 {Intrinsic::vp_bswap, MVT::i16, 3},
686 {Intrinsic::vp_bswap, MVT::i32, 12},
687 {Intrinsic::vp_bswap, MVT::i64, 31},
688 {Intrinsic::vp_fshl, MVT::i8, 7},
689 {Intrinsic::vp_fshl, MVT::i16, 7},
690 {Intrinsic::vp_fshl, MVT::i32, 7},
691 {Intrinsic::vp_fshl, MVT::i64, 7},
692 {Intrinsic::vp_fshr, MVT::i8, 7},
693 {Intrinsic::vp_fshr, MVT::i16, 7},
694 {Intrinsic::vp_fshr, MVT::i32, 7},
695 {Intrinsic::vp_fshr, MVT::i64, 7},
696 {Intrinsic::bitreverse, MVT::i8, 17},
697 {Intrinsic::bitreverse, MVT::i16, 24},
698 {Intrinsic::bitreverse, MVT::i32, 33},
699 {Intrinsic::bitreverse, MVT::i64, 52},
700 {Intrinsic::vp_bitreverse, MVT::i8, 17},
701 {Intrinsic::vp_bitreverse, MVT::i16, 24},
702 {Intrinsic::vp_bitreverse, MVT::i32, 33},
703 {Intrinsic::vp_bitreverse, MVT::i64, 52},
704 {Intrinsic::ctpop, MVT::i8, 12},
705 {Intrinsic::ctpop, MVT::i16, 19},
706 {Intrinsic::ctpop, MVT::i32, 20},
707 {Intrinsic::ctpop, MVT::i64, 21},
708 {Intrinsic::vp_ctpop, MVT::i8, 12},
709 {Intrinsic::vp_ctpop, MVT::i16, 19},
710 {Intrinsic::vp_ctpop, MVT::i32, 20},
711 {Intrinsic::vp_ctpop, MVT::i64, 21},
712 {Intrinsic::vp_ctlz, MVT::i8, 19},
713 {Intrinsic::vp_ctlz, MVT::i16, 28},
714 {Intrinsic::vp_ctlz, MVT::i32, 31},
715 {Intrinsic::vp_ctlz, MVT::i64, 35},
716 {Intrinsic::vp_cttz, MVT::i8, 16},
717 {Intrinsic::vp_cttz, MVT::i16, 23},
718 {Intrinsic::vp_cttz, MVT::i32, 24},
719 {Intrinsic::vp_cttz, MVT::i64, 25},
720 };
721
722 static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
723 switch (ID) {
724 #define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
725 case Intrinsic::VPID: \
726 return ISD::VPSD;
727 #include "llvm/IR/VPIntrinsics.def"
728 #undef HELPER_MAP_VPID_TO_VPSD
729 }
730 return ISD::DELETED_NODE;
731 }
732
733 InstructionCost
734 RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
735 TTI::TargetCostKind CostKind) {
736 auto *RetTy = ICA.getReturnType();
737 switch (ICA.getID()) {
738 case Intrinsic::ceil:
739 case Intrinsic::floor:
740 case Intrinsic::trunc:
741 case Intrinsic::rint:
742 case Intrinsic::lrint:
743 case Intrinsic::llrint:
744 case Intrinsic::round:
745 case Intrinsic::roundeven: {
746 // These all use the same code.
747 auto LT = getTypeLegalizationCost(RetTy);
748 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
749 return LT.first * 8;
750 break;
751 }
752 case Intrinsic::umin:
753 case Intrinsic::umax:
754 case Intrinsic::smin:
755 case Intrinsic::smax: {
756 auto LT = getTypeLegalizationCost(RetTy);
757 if ((ST->hasVInstructions() && LT.second.isVector()) ||
758 (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
759 return LT.first;
760 break;
761 }
762 case Intrinsic::sadd_sat:
763 case Intrinsic::ssub_sat:
764 case Intrinsic::uadd_sat:
765 case Intrinsic::usub_sat:
766 case Intrinsic::fabs:
767 case Intrinsic::sqrt: {
768 auto LT = getTypeLegalizationCost(RetTy);
769 if (ST->hasVInstructions() && LT.second.isVector())
770 return LT.first;
771 break;
772 }
773 case Intrinsic::ctpop: {
774 auto LT = getTypeLegalizationCost(RetTy);
775 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
776 return LT.first;
777 break;
778 }
779 case Intrinsic::abs: {
780 auto LT = getTypeLegalizationCost(RetTy);
781 if (ST->hasVInstructions() && LT.second.isVector()) {
782 // vrsub.vi v10, v8, 0
783 // vmax.vv v8, v8, v10
784 return LT.first * 2;
785 }
786 break;
787 }
788 // TODO: add more intrinsics
789 case Intrinsic::experimental_stepvector: {
790 unsigned Cost = 1; // vid
791 auto LT = getTypeLegalizationCost(RetTy);
792 return Cost + (LT.first - 1);
793 }
794 case Intrinsic::vp_rint: {
795 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
796 unsigned Cost = 5;
797 auto LT = getTypeLegalizationCost(RetTy);
798 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
799 return Cost * LT.first;
800 break;
801 }
802 case Intrinsic::vp_nearbyint: {
803 // Needs one more read and one more write of fflags than vp_rint.
804 unsigned Cost = 7;
805 auto LT = getTypeLegalizationCost(RetTy);
806 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
807 return Cost * LT.first;
808 break;
809 }
810 case Intrinsic::vp_ceil:
811 case Intrinsic::vp_floor:
812 case Intrinsic::vp_round:
813 case Intrinsic::vp_roundeven:
814 case Intrinsic::vp_roundtozero: {
815 // Rounding with static rounding mode needs two more instructions to
816 // swap/write FRM than vp_rint.
817 unsigned Cost = 7;
818 auto LT = getTypeLegalizationCost(RetTy);
819 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
820 if (TLI->isOperationCustom(VPISD, LT.second))
821 return Cost * LT.first;
822 break;
823 }
824 }
825
826 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
827 if (auto LT = getTypeLegalizationCost(RetTy);
828 LT.second.isVector()) {
829 MVT EltTy = LT.second.getVectorElementType();
830 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
831 ICA.getID(), EltTy))
832 return LT.first * Entry->Cost;
833 }
834 }
835
836 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
837 }
838
839 InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
840 Type *Src,
841 TTI::CastContextHint CCH,
842 TTI::TargetCostKind CostKind,
843 const Instruction *I) {
844 if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
845 // FIXME: Need to compute legalizing cost for illegal types.
846 if (!isTypeLegal(Src) || !isTypeLegal(Dst))
847 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
848
849 // Skip if element size of Dst or Src is bigger than ELEN.
850 if (Src->getScalarSizeInBits() > ST->getELen() ||
851 Dst->getScalarSizeInBits() > ST->getELen())
852 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
853
854 int ISD = TLI->InstructionOpcodeToISD(Opcode);
855 assert(ISD && "Invalid opcode");
856
857 // FIXME: Need to consider vsetvli and lmul.
858 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
859 (int)Log2_32(Src->getScalarSizeInBits());
860 switch (ISD) {
861 case ISD::SIGN_EXTEND:
862 case ISD::ZERO_EXTEND:
863 if (Src->getScalarSizeInBits() == 1) {
864 // We do not use vsext/vzext to extend from mask vector.
865 // Instead we use the following instructions to extend from mask vector:
866 // vmv.v.i v8, 0
867 // vmerge.vim v8, v8, -1, v0
868 return 2;
869 }
870 return 1;
871 case ISD::TRUNCATE:
872 if (Dst->getScalarSizeInBits() == 1) {
873 // We do not use several vncvt instructions to truncate to a mask vector,
874 // so we cannot use PowDiff to calculate it.
875 // Instead we use the following instructions to truncate to mask vector:
876 // vand.vi v8, v8, 1
877 // vmsne.vi v0, v8, 0
878 return 2;
879 }
880 [[fallthrough]];
881 case ISD::FP_EXTEND:
882 case ISD::FP_ROUND:
883 // Counts of narrow/widen instructions.
884 return std::abs(PowDiff);
885 case ISD::FP_TO_SINT:
886 case ISD::FP_TO_UINT:
887 case ISD::SINT_TO_FP:
888 case ISD::UINT_TO_FP:
889 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
890 // The cost of converting from or to a mask vector is different from the
891 // other cases, so we cannot use PowDiff to calculate it.
892 // For mask vector to fp, we should use the following instructions:
893 // vmv.v.i v8, 0
894 // vmerge.vim v8, v8, -1, v0
895 // vfcvt.f.x.v v8, v8
896
897 // And for fp vector to mask, we use:
898 // vfncvt.rtz.x.f.w v9, v8
899 // vand.vi v8, v9, 1
900 // vmsne.vi v0, v8, 0
901 return 3;
902 }
903 if (std::abs(PowDiff) <= 1)
904 return 1;
905 // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
906 // so it only needs two conversions.
907 if (Src->isIntOrIntVectorTy())
908 return 2;
909 // Counts of narrow/widen instructions.
910 return std::abs(PowDiff);
911 }
912 }
913 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
914 }
915
916 unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
917 if (isa<ScalableVectorType>(Ty)) {
918 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
919 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
920 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
921 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
922 }
923 return cast<FixedVectorType>(Ty)->getNumElements();
924 }
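
// For a scalable type this is effectively the minimum element count scaled by
// the tuning vscale, e.g. (illustrative) nxv4i32 with a tuning vscale of 2 gives
// VectorBits = 128 and an estimated VL of 8; for fixed vectors the exact element
// count is used.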
925
926 InstructionCost
927 RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
928 FastMathFlags FMF,
929 TTI::TargetCostKind CostKind) {
930 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
931 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
932
933 // Skip if scalar size of Ty is bigger than ELEN.
934 if (Ty->getScalarSizeInBits() > ST->getELen())
935 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
936
937 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
938 if (Ty->getElementType()->isIntegerTy(1))
939 // vcpop sequences, see vreduction-mask.ll. umax and smin actually only
940 // cost 2, but we don't have enough info here, so we slightly overestimate.
941 return (LT.first - 1) + 3;
942
943 // An IR reduction is composed of two vmv and one RVV reduction instruction.
944 InstructionCost BaseCost = 2;
945
946 if (CostKind == TTI::TCK_CodeSize)
947 return (LT.first - 1) + BaseCost;
948
949 unsigned VL = getEstimatedVLFor(Ty);
950 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
951 }
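
// Worked example (illustrative; assumes a tuning vscale of 2): an i32 umax
// reduction over nxv4i32 legalizes to a single register group (LT.first == 1),
// so the throughput cost is BaseCost 2 + Log2_32_Ceil(8) = 5; at TCK_CodeSize it
// is just the BaseCost of 2.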
952
953 InstructionCost
954 RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
955 std::optional<FastMathFlags> FMF,
956 TTI::TargetCostKind CostKind) {
957 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
958 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
959
960 // Skip if scalar size of Ty is bigger than ELEN.
961 if (Ty->getScalarSizeInBits() > ST->getELen())
962 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
963
964 int ISD = TLI->InstructionOpcodeToISD(Opcode);
965 assert(ISD && "Invalid opcode");
966
967 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
968 ISD != ISD::FADD)
969 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
970
971 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
972 if (Ty->getElementType()->isIntegerTy(1))
973 // vcpop sequences, see vreduction-mask.ll
974 return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);
975
976 // An IR reduction is composed of two vmv and one RVV reduction instruction.
977 InstructionCost BaseCost = 2;
978
979 if (CostKind == TTI::TCK_CodeSize)
980 return (LT.first - 1) + BaseCost;
981
982 unsigned VL = getEstimatedVLFor(Ty);
983 if (TTI::requiresOrderedReduction(FMF))
984 return (LT.first - 1) + BaseCost + VL;
985 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
986 }
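
// For example (illustrative; assumes a tuning vscale of 2): an unordered fadd
// reduction of nxv4f32 costs 2 + Log2_32_Ceil(8) = 5, while the ordered
// (sequential) form costs 2 + 8 = 10, reflecting the linear chain of
// vfredosum-style additions.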
987
988 InstructionCost RISCVTTIImpl::getExtendedReductionCost(
989 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
990 FastMathFlags FMF, TTI::TargetCostKind CostKind) {
991 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
992 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
993 FMF, CostKind);
994
995 // Skip if scalar size of ResTy is bigger than ELEN.
996 if (ResTy->getScalarSizeInBits() > ST->getELen())
997 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
998 FMF, CostKind);
999
1000 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1001 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1002 FMF, CostKind);
1003
1004 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1005
1006 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1007 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1008 FMF, CostKind);
1009
1010 return (LT.first - 1) +
1011 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1012 }
1013
1014 InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
1015 TTI::OperandValueInfo OpInfo,
1016 TTI::TargetCostKind CostKind) {
1017 assert(OpInfo.isConstant() && "non constant operand?");
1018 if (!isa<VectorType>(Ty))
1019 // FIXME: We need to account for immediate materialization here, but doing
1020 // a decent job requires more knowledge about the immediate than we
1021 // currently have here.
1022 return 0;
1023
1024 if (OpInfo.isUniform())
1025 // vmv.x.i, vmv.v.x, or vfmv.v.f
1026 // We ignore the cost of the scalar constant materialization to be consistent
1027 // with how we treat scalar constants themselves just above.
1028 return 1;
1029
1030 return getConstantPoolLoadCost(Ty, CostKind);
1031 }
1032
1033
1034 InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1035 MaybeAlign Alignment,
1036 unsigned AddressSpace,
1037 TTI::TargetCostKind CostKind,
1038 TTI::OperandValueInfo OpInfo,
1039 const Instruction *I) {
1040 EVT VT = TLI->getValueType(DL, Src, true);
1041 // Type legalization can't handle structs
1042 if (VT == MVT::Other)
1043 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1044 CostKind, OpInfo, I);
1045
1046 InstructionCost Cost = 0;
1047 if (Opcode == Instruction::Store && OpInfo.isConstant())
1048 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1049 InstructionCost BaseCost =
1050 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1051 CostKind, OpInfo, I);
1052 // Assume memory ops cost scale with the number of vector registers
1053 // possibly accessed by the instruction. Note that BasicTTI already
1054 // handles the LT.first term for us.
1055 if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1056 LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
1057 BaseCost *= TLI->getLMULCost(LT.second);
1058 return Cost + BaseCost;
1059
1060 }
1061
1062 InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1063 Type *CondTy,
1064 CmpInst::Predicate VecPred,
1065 TTI::TargetCostKind CostKind,
1066 const Instruction *I) {
1067 if (CostKind != TTI::TCK_RecipThroughput)
1068 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1069 I);
1070
1071 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1072 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1073 I);
1074
1075 // Skip if scalar size of ValTy is bigger than ELEN.
1076 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1077 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1078 I);
1079
1080 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1081 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1082 if (CondTy->isVectorTy()) {
1083 if (ValTy->getScalarSizeInBits() == 1) {
1084 // vmandn.mm v8, v8, v9
1085 // vmand.mm v9, v0, v9
1086 // vmor.mm v0, v9, v8
1087 return LT.first * 3;
1088 }
1089 // vselect and max/min are supported natively.
1090 return LT.first * 1;
1091 }
1092
1093 if (ValTy->getScalarSizeInBits() == 1) {
1094 // vmv.v.x v9, a0
1095 // vmsne.vi v9, v9, 0
1096 // vmandn.mm v8, v8, v9
1097 // vmand.mm v9, v0, v9
1098 // vmor.mm v0, v9, v8
1099 return LT.first * 5;
1100 }
1101
1102 // vmv.v.x v10, a0
1103 // vmsne.vi v0, v10, 0
1104 // vmerge.vvm v8, v9, v8, v0
1105 return LT.first * 3;
1106 }
1107
1108 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1109 ValTy->isVectorTy()) {
1110 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1111
1112 // Supported natively.
1113 if (CmpInst::isIntPredicate(VecPred))
1114 return LT.first * 1;
1115
1116 // If we do not support the input floating point vector type, use the base
1117 // one which will calculate as:
1118 // ScalarizeCost + Num * Cost for fixed vector,
1119 // InvalidCost for scalable vector.
1120 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1121 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1122 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1123 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1124 I);
1125 switch (VecPred) {
1126 // Supported natively.
1127 case CmpInst::FCMP_OEQ:
1128 case CmpInst::FCMP_OGT:
1129 case CmpInst::FCMP_OGE:
1130 case CmpInst::FCMP_OLT:
1131 case CmpInst::FCMP_OLE:
1132 case CmpInst::FCMP_UNE:
1133 return LT.first * 1;
1134 // TODO: Other comparisons?
1135 default:
1136 break;
1137 }
1138 }
1139
1140 // TODO: Add cost for scalar type.
1141
1142 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1143 }
1144
1145 InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
1146 TTI::TargetCostKind CostKind,
1147 const Instruction *I) {
1148 if (CostKind != TTI::TCK_RecipThroughput)
1149 return Opcode == Instruction::PHI ? 0 : 1;
1150 // Branches are assumed to be predicted.
1151 return 0;
1152 }
1153
1154 InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1155 TTI::TargetCostKind CostKind,
1156 unsigned Index, Value *Op0,
1157 Value *Op1) {
1158 assert(Val->isVectorTy() && "This must be a vector type");
1159
1160 if (Opcode != Instruction::ExtractElement &&
1161 Opcode != Instruction::InsertElement)
1162 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1163
1164 // Legalize the type.
1165 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1166
1167 // This type is legalized to a scalar type.
1168 if (!LT.second.isVector()) {
1169 auto *FixedVecTy = cast<FixedVectorType>(Val);
1170 // If Index is a known constant, cost is zero.
1171 if (Index != -1U)
1172 return 0;
1173 // Extract/InsertElement with non-constant index is very costly when
1174 // scalarized; estimate cost of loads/stores sequence via the stack:
1175 // ExtractElement cost: store vector to stack, load scalar;
1176 // InsertElement cost: store vector to stack, store scalar, load vector.
1177 Type *ElemTy = FixedVecTy->getElementType();
1178 auto NumElems = FixedVecTy->getNumElements();
1179 auto Align = DL.getPrefTypeAlign(ElemTy);
1180 InstructionCost LoadCost =
1181 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
1182 InstructionCost StoreCost =
1183 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
1184 return Opcode == Instruction::ExtractElement
1185 ? StoreCost * NumElems + LoadCost
1186 : (StoreCost + LoadCost) * NumElems + StoreCost;
1187 }
1188
1189 // For unsupported scalable vector.
1190 if (LT.second.isScalableVector() && !LT.first.isValid())
1191 return LT.first;
1192
1193 if (!isTypeLegal(Val))
1194 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1195
1196 // Mask vector extract/insert is expanded via e8.
1197 if (Val->getScalarSizeInBits() == 1) {
1198 VectorType *WideTy =
1199 VectorType::get(IntegerType::get(Val->getContext(), 8),
1200 cast<VectorType>(Val)->getElementCount());
1201 if (Opcode == Instruction::ExtractElement) {
1202 InstructionCost ExtendCost
1203 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1204 TTI::CastContextHint::None, CostKind);
1205 InstructionCost ExtractCost
1206 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1207 return ExtendCost + ExtractCost;
1208 }
1209 InstructionCost ExtendCost
1210 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1211 TTI::CastContextHint::None, CostKind);
1212 InstructionCost InsertCost
1213 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1214 InstructionCost TruncCost
1215 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
1216 TTI::CastContextHint::None, CostKind);
1217 return ExtendCost + InsertCost + TruncCost;
1218 }
1219
1220
1221 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
1222 // and vslideup + vmv.s.x to insert element to vector.
1223 unsigned BaseCost = 1;
1224 // For insertelement we also need an addi to add 1 to the index used as the input of vslideup.
1225 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1226
1227 if (Index != -1U) {
1228 // The type may be split. For fixed-width vectors we can normalize the
1229 // index to the new type.
1230 if (LT.second.isFixedLengthVector()) {
1231 unsigned Width = LT.second.getVectorNumElements();
1232 Index = Index % Width;
1233 }
1234
1235 // We could extract/insert the first element without vslidedown/vslideup.
1236 if (Index == 0)
1237 SlideCost = 0;
1238 else if (Opcode == Instruction::InsertElement)
1239 SlideCost = 1; // With a constant index, we do not need to use addi.
1240 }
1241
1242 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
1243 if (Val->getScalarType()->isIntegerTy() &&
1244 ST->getXLen() < Val->getScalarSizeInBits()) {
1245 // For extractelement, we need the following instructions:
1246 // vsetivli zero, 1, e64, m1, ta, mu (not count)
1247 // vslidedown.vx v8, v8, a0
1248 // vmv.x.s a0, v8
1249 // li a1, 32
1250 // vsrl.vx v8, v8, a1
1251 // vmv.x.s a1, v8
1252
1253 // For insertelement, we need the following instructions:
1254 // vsetivli zero, 2, e32, m4, ta, mu (not count)
1255 // vmv.v.i v12, 0
1256 // vslide1up.vx v16, v12, a1
1257 // vslide1up.vx v12, v16, a0
1258 // addi a0, a2, 1
1259 // vsetvli zero, a0, e64, m4, tu, mu (not count)
1260 // vslideup.vx v8, v12, a2
1261
1262 // TODO: should we count these special vsetvlis?
1263 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1264 }
1265 return BaseCost + SlideCost;
1266 }
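
// Worked examples (illustrative): extracting lane 0 of a legal <4 x i32> vector
// costs 1 (just vmv.x.s), extracting a non-zero constant lane costs 2
// (vslidedown.vi + vmv.x.s), and inserting at an unknown lane costs 3
// (vmv.s.x, an addi for index + 1, and vslideup.vx).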
1267
1268 InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
1269 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1270 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1271 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1272
1273 // TODO: Handle more cost kinds.
1274 if (CostKind != TTI::TCK_RecipThroughput)
1275 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1276 Args, CxtI);
1277
1278 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1279 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1280 Args, CxtI);
1281
1282 // Skip if scalar size of Ty is bigger than ELEN.
1283 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
1284 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1285 Args, CxtI);
1286
1287 // Legalize the type.
1288 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1289
1290 // TODO: Handle scalar type.
1291 if (!LT.second.isVector())
1292 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1293 Args, CxtI);
1294
1295
1296 auto getConstantMatCost =
1297 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1298 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1299 // Two sub-cases:
1300 // * Has a 5 bit immediate operand which can be splatted.
1301 // * Has a larger immediate which must be materialized in scalar register
1302 // We return 0 for both as we currently ignore the cost of materializing
1303 // scalar constants in GPRs.
1304 return 0;
1305
1306 return getConstantPoolLoadCost(Ty, CostKind);
1307 };
1308
1309 // Add the cost of materializing any constant vectors required.
1310 InstructionCost ConstantMatCost = 0;
1311 if (Op1Info.isConstant())
1312 ConstantMatCost += getConstantMatCost(0, Op1Info);
1313 if (Op2Info.isConstant())
1314 ConstantMatCost += getConstantMatCost(1, Op2Info);
1315
1316 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1317 case ISD::ADD:
1318 case ISD::SUB:
1319 case ISD::AND:
1320 case ISD::OR:
1321 case ISD::XOR:
1322 case ISD::SHL:
1323 case ISD::SRL:
1324 case ISD::SRA:
1325 case ISD::MUL:
1326 case ISD::MULHS:
1327 case ISD::MULHU:
1328 case ISD::FADD:
1329 case ISD::FSUB:
1330 case ISD::FMUL:
1331 case ISD::FNEG: {
1332 return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
1333 }
1334 default:
1335 return ConstantMatCost +
1336 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1337 Args, CxtI);
1338 }
1339 }
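
// Worked example (illustrative): `add <4 x i32> %v, splat(3)` with a legal type
// costs ConstantMatCost 0 (the uniform constant can be folded into vadd.vi or a
// scalar splat operand) plus LMULCost * LT.first for the single vadd; a
// non-splattable constant vector operand instead adds a constant-pool load.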
1340
1341 // TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
1342 InstructionCost RISCVTTIImpl::getPointersChainCost(
1343 ArrayRef<const Value *> Ptrs, const Value *Base,
1344 const TTI::PointersChainInfo &Info, Type *AccessTy,
1345 TTI::TargetCostKind CostKind) {
1346 InstructionCost Cost = TTI::TCC_Free;
1347 // In the basic model we take into account GEP instructions only
1348 // (although here can come alloca instruction, a value, constants and/or
1349 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
1350 // pointer). Typically, if Base is not a GEP instruction and all the
1351 // pointers are relative to the same base address, all the rest are
1352 // either GEP instructions, PHIs, bitcasts or constants. When we have the
1353 // same base, we just calculate the cost of each non-Base GEP as an ADD
1354 // operation if any of their indices is non-constant.
1355 // If there are no known dependencies between the pointers, the cost is
1356 // calculated as a sum of the costs of the GEP instructions.
1357 for (auto [I, V] : enumerate(Ptrs)) {
1358 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
1359 if (!GEP)
1360 continue;
1361 if (Info.isSameBase() && V != Base) {
1362 if (GEP->hasAllConstantIndices())
1363 continue;
1364 // If the chain is unit-stride and BaseReg + stride*i is a legal
1365 // addressing mode, then presume the base GEP is sitting around in a
1366 // register somewhere and check if we can fold the offset relative to
1367 // it.
1368 unsigned Stride = DL.getTypeStoreSize(AccessTy);
1369 if (Info.isUnitStride() &&
1370 isLegalAddressingMode(AccessTy,
1371 /* BaseGV */ nullptr,
1372 /* BaseOffset */ Stride * I,
1373 /* HasBaseReg */ true,
1374 /* Scale */ 0,
1375 GEP->getType()->getPointerAddressSpace()))
1376 continue;
1377 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
1378 {TTI::OK_AnyValue, TTI::OP_None},
1379 {TTI::OK_AnyValue, TTI::OP_None},
1380 std::nullopt);
1381 } else {
1382 SmallVector<const Value *> Indices(GEP->indices());
1383 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
1384 Indices, AccessTy, CostKind);
1385 }
1386 }
1387 return Cost;
1388 }
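
// For example (illustrative): a unit-stride chain of i32 accesses off one base
// pointer adds no cost for the non-base GEPs, because small offsets of 4 * i fit
// the 12-bit displacement of the base-register addressing mode; once the offsets
// no longer fold (or the stride is not unit), each such GEP is charged as an ADD.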
1389
1390 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1391 TTI::UnrollingPreferences &UP,
1392 OptimizationRemarkEmitter *ORE) {
1393 // TODO: More tuning on benchmarks and metrics with changes as needed
1394 // would apply to all settings below to enable performance.
1395
1396
1397 if (ST->enableDefaultUnroll())
1398 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1399
1400 // Enable upper-bound unrolling universally, not dependent upon the conditions
1401 // below.
1402 UP.UpperBound = true;
1403
1404 // Disable loop unrolling for Oz and Os.
1405 UP.OptSizeThreshold = 0;
1406 UP.PartialOptSizeThreshold = 0;
1407 if (L->getHeader()->getParent()->hasOptSize())
1408 return;
1409
1410 SmallVector<BasicBlock *, 4> ExitingBlocks;
1411 L->getExitingBlocks(ExitingBlocks);
1412 LLVM_DEBUG(dbgs() << "Loop has:\n"
1413 << "Blocks: " << L->getNumBlocks() << "\n"
1414 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1415
1416 // Only allow at most one exit other than the latch. This acts as an early exit
1417 // as it mirrors the profitability calculation of the runtime unroller.
1418 if (ExitingBlocks.size() > 2)
1419 return;
1420
1421 // Limit the CFG of the loop body for targets with a branch predictor.
1422 // Allowing 4 blocks permits if-then-else diamonds in the body.
1423 if (L->getNumBlocks() > 4)
1424 return;
1425
1426 // Don't unroll vectorized loops, including the remainder loop
1427 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1428 return;
1429
1430 // Scan the loop: don't unroll loops with calls as this could prevent
1431 // inlining.
1432 InstructionCost Cost = 0;
1433 for (auto *BB : L->getBlocks()) {
1434 for (auto &I : *BB) {
1435 // Initial setting - Don't unroll loops containing vectorized
1436 // instructions.
1437 if (I.getType()->isVectorTy())
1438 return;
1439
1440 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1441 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1442 if (!isLoweredToCall(F))
1443 continue;
1444 }
1445 return;
1446 }
1447
1448 SmallVector<const Value *> Operands(I.operand_values());
1449 Cost += getInstructionCost(&I, Operands,
1450 TargetTransformInfo::TCK_SizeAndLatency);
1451 }
1452 }
1453
1454 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1455
1456 UP.Partial = true;
1457 UP.Runtime = true;
1458 UP.UnrollRemainder = true;
1459 UP.UnrollAndJam = true;
1460 UP.UnrollAndJamInnerLoopThreshold = 60;
1461
1462 // Force-unrolling small loops can be very useful because of the
1463 // branch-taken cost of the backedge.
1464 if (Cost < 12)
1465 UP.Force = true;
1466 }
1467
1468 void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1469 TTI::PeelingPreferences &PP) {
1470 BaseT::getPeelingPreferences(L, SE, PP);
1471 }
1472
1473 unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
1474 TypeSize Size = DL.getTypeSizeInBits(Ty);
1475 if (Ty->isVectorTy()) {
1476 if (Size.isScalable() && ST->hasVInstructions())
1477 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1478
1479 if (ST->useRVVForFixedLengthVectors())
1480 return divideCeil(Size, ST->getRealMinVLen());
1481 }
1482
1483 return BaseT::getRegUsageForType(Ty);
1484 }
1485
1486 unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1487 if (SLPMaxVF.getNumOccurrences())
1488 return SLPMaxVF;
1489
1490 // Return how many elements can fit in getRegisterBitwidth. This is the
1491 // same routine as used in LoopVectorizer. We should probably be
1492 // accounting for whether we actually have instructions with the right
1493 // lane type, but we don't have enough information to do that without
1494 // some additional plumbing which hasn't been justified yet.
1495 TypeSize RegWidth =
1496 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
1497 // If no vector registers, or absurd element widths, disable
1498 // vectorization by returning 1.
1499 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
1500 }
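
// Worked example (illustrative): with a fixed-width register bit width of 256
// (VLEN >= 128 and the default LMUL of 2), getMaximumVF returns 8 for 32-bit
// elements and 32 for 8-bit elements, unless overridden with -riscv-v-slp-max-vf.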
1501
1502 bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1503 const TargetTransformInfo::LSRCost &C2) {
1504 // RISC-V-specific here: instruction count gets first priority.
1505 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
1506 C1.NumIVMuls, C1.NumBaseAdds,
1507 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1508 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
1509 C2.NumIVMuls, C2.NumBaseAdds,
1510 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1511 }
1512