1 //===- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass ---------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 /// \file 8 /// This file implements a TargetTransformInfo analysis pass specific to the 9 /// Hexagon target machine. It uses the target's detailed information to provide 10 /// more precise answers to certain TTI queries, while letting the target 11 /// independent and default TTI implementations handle the rest. 12 /// 13 //===----------------------------------------------------------------------===// 14 15 #include "HexagonTargetTransformInfo.h" 16 #include "HexagonSubtarget.h" 17 #include "llvm/Analysis/TargetTransformInfo.h" 18 #include "llvm/CodeGen/ValueTypes.h" 19 #include "llvm/IR/InstrTypes.h" 20 #include "llvm/IR/Instructions.h" 21 #include "llvm/IR/User.h" 22 #include "llvm/Support/Casting.h" 23 #include "llvm/Support/CommandLine.h" 24 #include "llvm/Transforms/Utils/LoopPeel.h" 25 #include "llvm/Transforms/Utils/UnrollLoop.h" 26 27 using namespace llvm; 28 29 #define DEBUG_TYPE "hexagontti" 30 31 static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false), 32 cl::Hidden, cl::desc("Enable loop vectorizer for HVX")); 33 34 static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables", 35 cl::init(true), cl::Hidden, 36 cl::desc("Control lookup table emission on Hexagon target")); 37 38 static cl::opt<bool> HexagonMaskedVMem("hexagon-masked-vmem", cl::init(true), 39 cl::Hidden, cl::desc("Enable loop vectorizer for HVX")); 40 41 // Constant "cost factor" to make floating point operations more expensive 42 // in terms of vectorization cost. This isn't the best way, but it should 43 // do. Ultimately, the cost should use cycles. 44 static const unsigned FloatFactor = 4; 45 46 bool HexagonTTIImpl::useHVX() const { 47 return ST.useHVXOps() && HexagonAutoHVX; 48 } 49 50 bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const { 51 if (!VecTy->isVectorTy() || isa<ScalableVectorType>(VecTy)) 52 return false; 53 // Avoid types like <2 x i32*>. 54 if (!cast<VectorType>(VecTy)->getElementType()->isIntegerTy()) 55 return false; 56 EVT VecVT = EVT::getEVT(VecTy); 57 if (!VecVT.isSimple() || VecVT.getSizeInBits() <= 64) 58 return false; 59 if (ST.isHVXVectorType(VecVT.getSimpleVT())) 60 return true; 61 auto Action = TLI.getPreferredVectorAction(VecVT.getSimpleVT()); 62 return Action == TargetLoweringBase::TypeWidenVector; 63 } 64 65 unsigned HexagonTTIImpl::getTypeNumElements(Type *Ty) const { 66 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) 67 return VTy->getNumElements(); 68 assert((Ty->isIntegerTy() || Ty->isFloatingPointTy()) && 69 "Expecting scalar type"); 70 return 1; 71 } 72 73 TargetTransformInfo::PopcntSupportKind 74 HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const { 75 // Return fast hardware support as every input < 64 bits will be promoted 76 // to 64 bits. 77 return TargetTransformInfo::PSK_FastHardware; 78 } 79 80 // The Hexagon target can unroll loops with run-time trip counts. 81 void HexagonTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 82 TTI::UnrollingPreferences &UP) { 83 UP.Runtime = UP.Partial = true; 84 } 85 86 void HexagonTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 87 TTI::PeelingPreferences &PP) { 88 BaseT::getPeelingPreferences(L, SE, PP); 89 // Only try to peel innermost loops with small runtime trip counts. 90 if (L && L->empty() && canPeel(L) && 91 SE.getSmallConstantTripCount(L) == 0 && 92 SE.getSmallConstantMaxTripCount(L) > 0 && 93 SE.getSmallConstantMaxTripCount(L) <= 5) { 94 PP.PeelCount = 2; 95 } 96 } 97 98 bool HexagonTTIImpl::shouldFavorPostInc() const { 99 return true; 100 } 101 102 /// --- Vector TTI begin --- 103 104 unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector) const { 105 if (Vector) 106 return useHVX() ? 32 : 0; 107 return 32; 108 } 109 110 unsigned HexagonTTIImpl::getMaxInterleaveFactor(unsigned VF) { 111 return useHVX() ? 2 : 0; 112 } 113 114 unsigned HexagonTTIImpl::getRegisterBitWidth(bool Vector) const { 115 return Vector ? getMinVectorRegisterBitWidth() : 32; 116 } 117 118 unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const { 119 return useHVX() ? ST.getVectorLength()*8 : 32; 120 } 121 122 unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const { 123 return (8 * ST.getVectorLength()) / ElemWidth; 124 } 125 126 unsigned HexagonTTIImpl::getScalarizationOverhead(VectorType *Ty, 127 const APInt &DemandedElts, 128 bool Insert, bool Extract) { 129 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); 130 } 131 132 unsigned HexagonTTIImpl::getOperandsScalarizationOverhead( 133 ArrayRef<const Value*> Args, unsigned VF) { 134 return BaseT::getOperandsScalarizationOverhead(Args, VF); 135 } 136 137 unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy, 138 ArrayRef<Type*> Tys, TTI::TargetCostKind CostKind) { 139 return BaseT::getCallInstrCost(F, RetTy, Tys, CostKind); 140 } 141 142 unsigned 143 HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 144 TTI::TargetCostKind CostKind) { 145 if (ICA.getID() == Intrinsic::bswap) { 146 std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, ICA.getReturnType()); 147 return LT.first + 2; 148 } 149 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 150 } 151 152 unsigned HexagonTTIImpl::getAddressComputationCost(Type *Tp, 153 ScalarEvolution *SE, const SCEV *S) { 154 return 0; 155 } 156 157 unsigned HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, 158 MaybeAlign Alignment, 159 unsigned AddressSpace, 160 TTI::TargetCostKind CostKind, 161 const Instruction *I) { 162 assert(Opcode == Instruction::Load || Opcode == Instruction::Store); 163 // TODO: Handle other cost kinds. 164 if (CostKind != TTI::TCK_RecipThroughput) 165 return 1; 166 167 if (Opcode == Instruction::Store) 168 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 169 CostKind, I); 170 171 if (Src->isVectorTy()) { 172 VectorType *VecTy = cast<VectorType>(Src); 173 unsigned VecWidth = VecTy->getPrimitiveSizeInBits().getFixedSize(); 174 if (useHVX() && isTypeForHVX(VecTy)) { 175 unsigned RegWidth = getRegisterBitWidth(true); 176 assert(RegWidth && "Non-zero vector register width expected"); 177 // Cost of HVX loads. 178 if (VecWidth % RegWidth == 0) 179 return VecWidth / RegWidth; 180 // Cost of constructing HVX vector from scalar loads 181 const Align RegAlign(RegWidth / 8); 182 if (!Alignment || *Alignment > RegAlign) 183 Alignment = RegAlign; 184 assert(Alignment); 185 unsigned AlignWidth = 8 * Alignment->value(); 186 unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth; 187 return 3 * NumLoads; 188 } 189 190 // Non-HVX vectors. 191 // Add extra cost for floating point types. 192 unsigned Cost = 193 VecTy->getElementType()->isFloatingPointTy() ? FloatFactor : 1; 194 195 // At this point unspecified alignment is considered as Align(1). 196 const Align BoundAlignment = std::min(Alignment.valueOrOne(), Align(8)); 197 unsigned AlignWidth = 8 * BoundAlignment.value(); 198 unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth; 199 if (Alignment == Align(4) || Alignment == Align(8)) 200 return Cost * NumLoads; 201 // Loads of less than 32 bits will need extra inserts to compose a vector. 202 assert(BoundAlignment <= Align(8)); 203 unsigned LogA = Log2(BoundAlignment); 204 return (3 - LogA) * Cost * NumLoads; 205 } 206 207 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 208 CostKind, I); 209 } 210 211 unsigned HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, 212 Align Alignment, 213 unsigned AddressSpace, 214 TTI::TargetCostKind CostKind) { 215 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 216 CostKind); 217 } 218 219 unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, 220 int Index, Type *SubTp) { 221 return 1; 222 } 223 224 unsigned HexagonTTIImpl::getGatherScatterOpCost( 225 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 226 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 227 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 228 Alignment, CostKind, I); 229 } 230 231 unsigned HexagonTTIImpl::getInterleavedMemoryOpCost( 232 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 233 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 234 bool UseMaskForCond, bool UseMaskForGaps) { 235 if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps) 236 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 237 Alignment, AddressSpace, 238 CostKind, 239 UseMaskForCond, UseMaskForGaps); 240 return getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, 241 CostKind); 242 } 243 244 unsigned HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 245 Type *CondTy, TTI::TargetCostKind CostKind, const Instruction *I) { 246 if (ValTy->isVectorTy() && CostKind == TTI::TCK_RecipThroughput) { 247 std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, ValTy); 248 if (Opcode == Instruction::FCmp) 249 return LT.first + FloatFactor * getTypeNumElements(ValTy); 250 } 251 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I); 252 } 253 254 unsigned HexagonTTIImpl::getArithmeticInstrCost( 255 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 256 TTI::OperandValueKind Opd1Info, 257 TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, 258 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, 259 const Instruction *CxtI) { 260 // TODO: Handle more cost kinds. 261 if (CostKind != TTI::TCK_RecipThroughput) 262 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 263 Opd2Info, Opd1PropInfo, 264 Opd2PropInfo, Args, CxtI); 265 266 if (Ty->isVectorTy()) { 267 std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, Ty); 268 if (LT.second.isFloatingPoint()) 269 return LT.first + FloatFactor * getTypeNumElements(Ty); 270 } 271 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info, 272 Opd1PropInfo, Opd2PropInfo, Args, CxtI); 273 } 274 275 unsigned HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy, 276 Type *SrcTy, TTI::CastContextHint CCH, 277 TTI::TargetCostKind CostKind, 278 const Instruction *I) { 279 if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) { 280 unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0; 281 unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0; 282 283 std::pair<int, MVT> SrcLT = TLI.getTypeLegalizationCost(DL, SrcTy); 284 std::pair<int, MVT> DstLT = TLI.getTypeLegalizationCost(DL, DstTy); 285 unsigned Cost = std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN); 286 // TODO: Allow non-throughput costs that aren't binary. 287 if (CostKind != TTI::TCK_RecipThroughput) 288 return Cost == 0 ? 0 : 1; 289 return Cost; 290 } 291 return 1; 292 } 293 294 unsigned HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 295 unsigned Index) { 296 Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType() 297 : Val; 298 if (Opcode == Instruction::InsertElement) { 299 // Need two rotations for non-zero index. 300 unsigned Cost = (Index != 0) ? 2 : 0; 301 if (ElemTy->isIntegerTy(32)) 302 return Cost; 303 // If it's not a 32-bit value, there will need to be an extract. 304 return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, Index); 305 } 306 307 if (Opcode == Instruction::ExtractElement) 308 return 2; 309 310 return 1; 311 } 312 313 bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/) { 314 return HexagonMaskedVMem && isTypeForHVX(DataType); 315 } 316 317 bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/) { 318 return HexagonMaskedVMem && isTypeForHVX(DataType); 319 } 320 321 /// --- Vector TTI end --- 322 323 unsigned HexagonTTIImpl::getPrefetchDistance() const { 324 return ST.getL1PrefetchDistance(); 325 } 326 327 unsigned HexagonTTIImpl::getCacheLineSize() const { 328 return ST.getL1CacheLineSize(); 329 } 330 331 int 332 HexagonTTIImpl::getUserCost(const User *U, 333 ArrayRef<const Value *> Operands, 334 TTI::TargetCostKind CostKind) { 335 auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool { 336 if (!CI->isIntegerCast()) 337 return false; 338 // Only extensions from an integer type shorter than 32-bit to i32 339 // can be folded into the load. 340 const DataLayout &DL = getDataLayout(); 341 unsigned SBW = DL.getTypeSizeInBits(CI->getSrcTy()); 342 unsigned DBW = DL.getTypeSizeInBits(CI->getDestTy()); 343 if (DBW != 32 || SBW >= DBW) 344 return false; 345 346 const LoadInst *LI = dyn_cast<const LoadInst>(CI->getOperand(0)); 347 // Technically, this code could allow multiple uses of the load, and 348 // check if all the uses are the same extension operation, but this 349 // should be sufficient for most cases. 350 return LI && LI->hasOneUse(); 351 }; 352 353 if (const CastInst *CI = dyn_cast<const CastInst>(U)) 354 if (isCastFoldedIntoLoad(CI)) 355 return TargetTransformInfo::TCC_Free; 356 return BaseT::getUserCost(U, Operands, CostKind); 357 } 358 359 bool HexagonTTIImpl::shouldBuildLookupTables() const { 360 return EmitLookupTables; 361 } 362