//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RISCVTargetTransformInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/TargetLowering.h"
#include <cmath>
using namespace llvm;

#define DEBUG_TYPE "riscvtti"

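// The LMUL below only scales what getRegisterBitWidth() reports; for example,
// -riscv-v-register-bit-width-lmul=4 makes the vectorizers see a vector
// register four times the base vector register size.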
static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(1), cl::Hidden);

InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                            TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Otherwise, we check how many instructions it will take to materialise.
  const DataLayout &DL = getDataLayout();
  return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty),
                                    getST()->getFeatureBits());
}

InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Some instructions in RISC-V can take a 12-bit immediate. Some of these
  // are commutative; in others the immediate must be in a specific operand
  // position.
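  // For example, addi, andi, ori and xori encode a sign-extended 12-bit
  // immediate (-2048..2047), so such constants never need to be materialised
  // into a register first.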
  bool Takes12BitImm = false;
  unsigned ImmArgIdx = ~0U;

  switch (Opcode) {
  case Instruction::GetElementPtr:
    // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
    // split up large offsets in GEP into better parts than ConstantHoisting
    // can.
    return TTI::TCC_Free;
  case Instruction::And:
    // zext.h
    if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
      return TTI::TCC_Free;
    // zext.w (provided by Zba, as add.uw against the zero register)
    if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
      return TTI::TCC_Free;
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Mul:
    Takes12BitImm = true;
    break;
  case Instruction::Sub:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    Takes12BitImm = true;
    ImmArgIdx = 1;
    break;
  default:
    break;
  }

  if (Takes12BitImm) {
    // Check that the immediate is in the expected operand position...
    if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
      // ... and that it fits into the 12-bit immediate field.
      if (Imm.getMinSignedBits() <= 64 &&
          getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
        return TTI::TCC_Free;
      }
    }

    // Otherwise, use the full materialisation cost.
    return getIntImmCost(Imm, Ty, CostKind);
  }

  // By default, prevent hoisting.
  return TTI::TCC_Free;
}

InstructionCost
RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Prevent hoisting in unknown cases.
  return TTI::TCC_Free;
}

TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
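  // Zbb provides cpop/cpopw, so population count is a single cheap
  // instruction there; otherwise it must be expanded in software.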
  return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Currently, the ExpandReductions pass can't expand scalable-vector
  // reductions, but we still request expansion as RVV doesn't support certain
  // reductions and the SelectionDAG can't legalize them either.
  switch (II->getIntrinsicID()) {
  default:
    return false;
  // These reductions have no equivalent in RVV
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_fmul:
    return true;
  }
}

Optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  // The V specification does not impose a maximum vector length, so we use
  // the value specified by the user, if any, as the assumed maximum. This
  // function translates that assumed maximum vector length into the maximum
  // vscale reported to the LoopVectorizer. If the user does not specify a
  // maximum vector length, we have no way of knowing what the LoopVectorizer
  // can safely assume. We only consider vectorizing with a single vector
  // register (LMUL = 1).
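  // For example, if the user asserts a 512-bit maximum vector length, the
  // reported maximum vscale is 512 / 64 (RVVBitsPerBlock) = 8.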
  unsigned MaxVectorSizeInBits = ST->getMaxRVVVectorSizeInBits();
  if (ST->hasVInstructions() && MaxVectorSizeInBits != 0)
    return MaxVectorSizeInBits / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
}

TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
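  // Clamp the requested LMUL to the valid range [1, 8] and round it down to a
  // power of two; fractional LMULs are not handled here.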
  unsigned LMUL = PowerOf2Floor(
      std::max<unsigned>(std::min<unsigned>(RVVRegisterWidthLMUL, 8), 1));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->hasVInstructions() ? LMUL * ST->getMinRVVVectorSizeInBits() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        ST->hasVInstructions() ? LMUL * RISCV::RVVBitsPerBlock : 0);
  }

  llvm_unreachable("Unsupported register kind");
}

InstructionCost RISCVTTIImpl::getSpliceCost(VectorType *Tp, int Index) {
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  unsigned Cost = 2; // vslidedown+vslideup.
  // TODO: LMUL should increase cost.
  // TODO: Multiplying by LT.first implies this legalizes into multiple copies
  // of similar code, but I think we expand through memory.
  return Cost * LT.first;
}

InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args) {
  if (isa<ScalableVectorType>(Tp)) {
    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
    switch (Kind) {
    default:
      // Fallthrough to generic handling.
      // TODO: Most of these cases will return getInvalid in generic code, and
      // must be implemented here.
      break;
    case TTI::SK_Broadcast: {
      return LT.first * 1;
    }
    case TTI::SK_Splice:
      return getSpliceCost(Tp, Index);
    case TTI::SK_Reverse:
      // Most of the cost here is producing the vrgather index register.
      // Example sequence:
      //   csrr a0, vlenb
      //   srli a0, a0, 3
      //   addi a0, a0, -1
      //   vsetvli a1, zero, e8, mf8, ta, mu (ignored)
      //   vid.v v9
      //   vrsub.vx v10, v9, a0
      //   vrgather.vv v9, v8, v10
      return LT.first * 6;
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}

InstructionCost
RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                    unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
  if (!isa<ScalableVectorType>(Src))
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);

  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if ((Opcode == Instruction::Load &&
       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
      (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(DataTy, Align(Alignment))))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  // Cost is proportional to the number of memory operations implied.  For
  // scalable vectors, we use an upper bound on that number since we don't
  // know exactly what VL will be.
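  // For example, a fixed-length <8 x i32> gather whose per-element memory
  // access costs 1 is modelled as 8 independent memory operations.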
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost = getMemoryOpCost(Opcode, VTy.getElementType(),
                                              Alignment, 0, CostKind, I);
  if (isa<ScalableVectorType>(VTy)) {
    const unsigned EltSize = DL.getTypeSizeInBits(VTy.getElementType());
    const unsigned MinSize = DL.getTypeSizeInBits(&VTy).getKnownMinValue();
    const unsigned VectorBitsMax = ST->getRealMaxVLen();
    const unsigned MaxVLMAX =
      RISCVTargetLowering::computeVLMAX(VectorBitsMax, EltSize, MinSize);
    return MaxVLMAX * MemOpCost;
  }
  unsigned NumLoads = cast<FixedVectorType>(VTy).getNumElements();
  return NumLoads * MemOpCost;
}

InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  // TODO: Add more intrinsics.
  case Intrinsic::experimental_stepvector: {
    unsigned Cost = 1; // vid
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
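    // If legalization splits the type into LT.first parts, each part beyond
    // the first is assumed to need one extra instruction to continue the
    // step sequence.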
    return Cost + (LT.first - 1);
  }
  default:
    break;
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
    // FIXME: Need to compute legalizing cost for illegal types.
    if (!isTypeLegal(Src) || !isTypeLegal(Dst))
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    // Skip if element size of Dst or Src is bigger than ELEN.
    if (Src->getScalarSizeInBits() > ST->getELEN() ||
        Dst->getScalarSizeInBits() > ST->getELEN())
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // FIXME: Need to consider vsetvli and lmul.
    int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
                  (int)Log2_32(Src->getScalarSizeInBits());
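    // PowDiff is the number of doubling/halving steps between the source and
    // destination element widths, e.g. converting i16 elements to i64
    // elements gives PowDiff = 2.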
    switch (ISD) {
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
      return 1;
    case ISD::TRUNCATE:
    case ISD::FP_EXTEND:
    case ISD::FP_ROUND:
      // Cost is the number of narrowing/widening instructions required.
      return std::abs(PowDiff);
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:
      if (std::abs(PowDiff) <= 1)
        return 1;
      // The backend can lower, e.g., an i8-to-double conversion as an integer
      // extend followed by a single int-to-FP conversion, so only two
      // instructions are needed.
      if (Src->isIntOrIntVectorTy())
        return 2;
      // Cost is the number of narrowing/widening instructions required.
      return std::abs(PowDiff);
    }
  }
  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                     bool IsUnsigned,
                                     TTI::TargetCostKind CostKind) {
  // FIXME: Only supporting fixed vectors for now.
  if (!isa<FixedVectorType>(Ty))
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);

  if (!ST->useRVVForFixedLengthVectors())
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll. umax and smin actually only
    // cost 2, but we don't have enough information here, so we slightly
    // overestimate the cost.
    return (LT.first - 1) + 3;

  // An IR reduction is composed of two vmv instructions and one RVV
  // reduction instruction.
  InstructionCost BaseCost = 2;
  unsigned VL = cast<FixedVectorType>(Ty)->getNumElements();
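  // The reduction itself is modelled as a binary tree over the VL elements,
  // e.g. Log2_32_Ceil(16) = 4 steps for a 16-element vector.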
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                         Optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  // FIXME: Only supporting fixed vectors for now.
  if (!isa<FixedVectorType>(Ty))
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  if (!ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
      ISD != ISD::FADD)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll
    return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);

  // An IR reduction is composed of two vmv instructions and one RVV
  // reduction instruction.
  InstructionCost BaseCost = 2;
  unsigned VL = cast<FixedVectorType>(Ty)->getNumElements();
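  // An ordered (in-order) floating-point reduction has to combine the
  // elements sequentially, so it is costed linearly in VL rather than as a
  // log2 tree.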
  if (TTI::requiresOrderedReduction(FMF))
    return (LT.first - 1) + BaseCost + VL;
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: All of the settings below would benefit from more tuning on
  //       benchmarks and metrics, with changes made as needed.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper-bound unrolling universally; it does not depend on the
  // conditions below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Allow at most one exit other than the latch. This acts as an early exit,
  // mirroring the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops containing calls, as this could
  // prevent inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - Don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost +=
          getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Forcing unrolling of small loops can be very useful because of the
  // branch-taken cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = Ty->getPrimitiveSizeInBits();
  if (Ty->isVectorTy()) {
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getMinRVVVectorSizeInBits());
  }

  return BaseT::getRegUsageForType(Ty);
}