//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RISCVTargetTransformInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/TargetLowering.h"
#include <cmath>
using namespace llvm;

#define DEBUG_TYPE "riscvtti"

static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(1), cl::Hidden);

InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                            TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Otherwise, we check how many instructions it will take to materialise.
  const DataLayout &DL = getDataLayout();
  return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty),
                                    getST()->getFeatureBits());
}

InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
  // commutative; for the others, the immediate must come from a specific
  // argument index.
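  // An ImmArgIdx of ~0U means no specific operand position is required; for
  // those opcodes, the commutativity check below is what permits the immediate.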
  bool Takes12BitImm = false;
  unsigned ImmArgIdx = ~0U;

  switch (Opcode) {
  case Instruction::GetElementPtr:
    // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
    // split up large offsets in GEP into better parts than ConstantHoisting
    // can.
    return TTI::TCC_Free;
  case Instruction::And:
    // zext.h
    if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
      return TTI::TCC_Free;
    // zext.w
    if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
      return TTI::TCC_Free;
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Mul:
    Takes12BitImm = true;
    break;
  case Instruction::Sub:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    Takes12BitImm = true;
    ImmArgIdx = 1;
    break;
  default:
    break;
  }

  if (Takes12BitImm) {
    // Check that the immediate is in the correct operand position...
    if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
      // ... and that it fits into the 12-bit immediate.
      if (Imm.getMinSignedBits() <= 64 &&
          getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
        return TTI::TCC_Free;
      }
    }

    // Otherwise, use the full materialisation cost.
    return getIntImmCost(Imm, Ty, CostKind);
  }

  // By default, prevent hoisting.
  return TTI::TCC_Free;
}

InstructionCost
RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Prevent hoisting in unknown cases.
  return TTI::TCC_Free;
}

TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Currently, the ExpandReductions pass can't expand scalable-vector
  // reductions, but we still request expansion as RVV doesn't support certain
  // reductions and the SelectionDAG can't legalize them either.
  switch (II->getIntrinsicID()) {
  default:
    return false;
  // These reductions have no equivalent in RVV
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_fmul:
    return true;
  }
}

Optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  if (ST->hasVInstructions())
    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
}

Optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
  if (ST->hasVInstructions())
    return ST->getRealMinVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getVScaleForTuning();
}

TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
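  // Clamp the requested LMUL to the valid range [1, 8] and round it down to a
  // power of two before scaling the vector register width below.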
  unsigned LMUL = PowerOf2Floor(
      std::max<unsigned>(std::min<unsigned>(RVVRegisterWidthLMUL, 8), 1));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        ST->hasVInstructions() ? LMUL * RISCV::RVVBitsPerBlock : 0);
  }

  llvm_unreachable("Unsupported register kind");
}

InstructionCost RISCVTTIImpl::getSpliceCost(VectorType *Tp, int Index) {
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  unsigned Cost = 2; // vslidedown+vslideup.
  // TODO: LMUL should increase cost.
  // TODO: Multiplying by LT.first implies this legalizes into multiple copies
  // of similar code, but I think we expand through memory.
  return Cost * LT.first;
}

InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args) {
  if (isa<ScalableVectorType>(Tp)) {
    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
    switch (Kind) {
    default:
      // Fallthrough to generic handling.
      // TODO: Most of these cases will return getInvalid in generic code, and
      // must be implemented here.
      break;
    case TTI::SK_Broadcast: {
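      // A broadcast is modelled as a single splat instruction per legalized
      // vector, hence the multiplier of 1.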
      return LT.first * 1;
    }
    case TTI::SK_Splice:
      return getSpliceCost(Tp, Index);
    case TTI::SK_Reverse:
      // Most of the cost here is producing the vrgather index register.
      // Example sequence:
      //   csrr a0, vlenb
      //   srli a0, a0, 3
      //   addi a0, a0, -1
      //   vsetvli a1, zero, e8, mf8, ta, mu (ignored)
      //   vid.v v9
      //   vrsub.vx v10, v9, a0
      //   vrgather.vv v9, v8, v10
      if (Tp->getElementType()->isIntegerTy(1))
        // Mask operations additionally require an extend and a truncate.
        return LT.first * 9;
      return LT.first * 6;
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}

InstructionCost
RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                    unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
  if (!isa<ScalableVectorType>(Src))
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);

  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if ((Opcode == Instruction::Load &&
       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
      (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(DataTy, Align(Alignment))))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an upper bound on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost = getMemoryOpCost(Opcode, VTy.getElementType(),
                                              Alignment, 0, CostKind, I);
  unsigned NumLoads = getMaxVLFor(&VTy);
  return NumLoads * MemOpCost;
}

InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  // TODO: Add more intrinsics.
  case Intrinsic::experimental_stepvector: {
    unsigned Cost = 1; // vid
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
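    // One vid.v per legalized part: each of the extra (LT.first - 1) parts
    // needs its own instruction.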
    return Cost + (LT.first - 1);
  }
  default:
    break;
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
    // FIXME: Need to compute legalizing cost for illegal types.
    if (!isTypeLegal(Src) || !isTypeLegal(Dst))
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    // Skip if element size of Dst or Src is bigger than ELEN.
    if (Src->getScalarSizeInBits() > ST->getELEN() ||
        Dst->getScalarSizeInBits() > ST->getELEN())
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // FIXME: Need to consider vsetvli and lmul.
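    // PowDiff is the difference between the element sizes of Dst and Src in
    // powers of two; it counts the widening/narrowing steps needed.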
    int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
                  (int)Log2_32(Src->getScalarSizeInBits());
    switch (ISD) {
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
      return 1;
    case ISD::TRUNCATE:
    case ISD::FP_EXTEND:
    case ISD::FP_ROUND:
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:
      if (std::abs(PowDiff) <= 1)
        return 1;
      // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
      // so it only needs two conversions.
      if (Src->isIntOrIntVectorTy())
        return 2;
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    }
  }
  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

unsigned RISCVTTIImpl::getMaxVLFor(VectorType *Ty) {
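  // Upper bound on the number of elements: VLMAX for scalable vectors, the
  // exact element count for fixed-length vectors.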
  if (isa<ScalableVectorType>(Ty)) {
    const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
    const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
    const unsigned VectorBitsMax = ST->getRealMaxVLen();
    return RISCVTargetLowering::computeVLMAX(VectorBitsMax, EltSize, MinSize);
  }
  return cast<FixedVectorType>(Ty)->getNumElements();
}

InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                     bool IsUnsigned,
                                     TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll. umax and smin actually only
    // cost 2, but we don't have enough information here, so we slightly
    // over-cost them.
    return (LT.first - 1) + 3;

  // An IR reduction is composed of two vmv instructions and one RVV reduction
  // instruction.
  InstructionCost BaseCost = 2;
  unsigned VL = getMaxVLFor(Ty);
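  // Total: extra legalization parts (LT.first - 1), the vmv+reduction base
  // cost, plus a log2(VLMAX) term.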
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                         Optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
      ISD != ISD::FADD)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll
    return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);

  // An IR reduction is composed of two vmv instructions and one RVV reduction
  // instruction.
  InstructionCost BaseCost = 2;
  unsigned VL = getMaxVLFor(Ty);
  if (TTI::requiresOrderedReduction(FMF))
    return (LT.first - 1) + BaseCost + VL;
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: All of the settings below need further tuning on benchmarks and
  // metrics, with changes applied as needed, to improve performance.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper-bound unrolling universally, independent of the conditions
  // below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow one exit other than the latch. This acts as an early exit, as
  // it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - Don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost +=
          getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Force-unrolling small loops can be very useful because of the branch-taken
  // cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = Ty->getPrimitiveSizeInBits();
  if (Ty->isVectorTy()) {
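    // A scalable vector occupies one vector register per RVVBitsPerBlock of
    // its known minimum size; a fixed-length vector needs one register per
    // minimum VLEN bits.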
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }

  return BaseT::getRegUsageForType(Ty);
}
