//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// AArch64 target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target-independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "aarch64tti"

// Declare the pass initialization routine locally as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializeAArch64TTIPass(PassRegistry &);
}

namespace {

class AArch64TTI final : public ImmutablePass, public TargetTransformInfo {
  const AArch64TargetMachine *TM;
  const AArch64Subtarget *ST;
  const AArch64TargetLowering *TLI;

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

public:
  AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
    llvm_unreachable("This pass cannot be directly constructed");
  }

  AArch64TTI(const AArch64TargetMachine *TM)
      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
        TLI(TM->getSubtargetImpl()->getTargetLowering()) {
    initializeAArch64TTIPass(*PassRegistry::getPassRegistry());
  }

  void initializePass() override { pushTTIStack(this); }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    TargetTransformInfo::getAnalysisUsage(AU);
  }

  /// Pass identification.
  static char ID;

  /// Provide necessary pointer adjustments for the two base classes.
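  /// Because this pass inherits from both ImmutablePass and
  /// TargetTransformInfo, a query for the TargetTransformInfo interface needs
  /// a this-pointer adjusted to that base sub-object; any other ID gets the
  /// unadjusted pass pointer.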
  void *getAdjustedAnalysisPointer(const void *ID) override {
    if (ID == &TargetTransformInfo::ID)
      return (TargetTransformInfo *)this;
    return this;
  }

  /// \name Scalar TTI Implementations
  /// @{
  unsigned getIntImmCost(int64_t Val) const;
  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getNumberOfRegisters(bool Vector) const override {
    if (Vector) {
      if (ST->hasNEON())
        return 32;
      return 0;
    }
    return 31;
  }

  unsigned getRegisterBitWidth(bool Vector) const override {
    if (Vector) {
      if (ST->hasNEON())
        return 128;
      return 0;
    }
    return 64;
  }

  unsigned getMaximumUnrollFactor() const override { return 2; }

  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
                            Type *Src) const override;

  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
                              unsigned Index) const override;

  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                  OperandValueKind Opd1Info = OK_AnyValue,
                                  OperandValueKind Opd2Info = OK_AnyValue) const
      override;

  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;

  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                              Type *CondTy) const override;

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                           unsigned AddressSpace) const override;

  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const override;

  /// @}
};

} // end anonymous namespace

INITIALIZE_AG_PASS(AArch64TTI, TargetTransformInfo, "aarch64tti",
                   "AArch64 Target Transform Info", true, true, false)
char AArch64TTI::ID = 0;

ImmutablePass *
llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) {
  return new AArch64TTI(TM);
}

/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
unsigned AArch64TTI::getIntImmCost(int64_t Val) const {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  unsigned LZ = countLeadingZeros((uint64_t)Val);
  return (64 - LZ + 15) / 16;
}

/// \brief Calculate the cost of materializing the given constant.
unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64 bits.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
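  // Illustrative note (an assumed example, not from the original source): a
  // 128-bit constant is priced below as two independent 64-bit chunks. Each
  // chunk is costed by the int64_t overload above as the number of 16-bit
  // move pieces needed to build it (zero if the chunk is 0 or a logical
  // immediate); e.g. a chunk of 0x0000000000120034 has 43 leading zeros and
  // costs (64 - 43 + 15) / 16 = 2 moves. The sum is then clamped to a minimum
  // of one instruction.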
  unsigned Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max(1U, Cost);
}

unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
                                   const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TCC_Basic;
    return TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    unsigned NumConstants = (BitSize + 63) / 64;
    unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
    return (Cost <= NumConstants * TCC_Basic)
               ? static_cast<unsigned>(TCC_Free) : Cost;
  }
  return AArch64TTI::getIntImmCost(Imm, Ty);
}

unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                                   const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TCC_Free;

  switch (IID) {
  default:
    return TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      unsigned NumConstants = (BitSize + 63) / 64;
      unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
      return (Cost <= NumConstants * TCC_Basic) ?
                 static_cast<unsigned>(TCC_Free) : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  }
  return AArch64TTI::getIntImmCost(Imm, Ty);
}

AArch64TTI::PopcntSupportKind
AArch64TTI::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return PSK_Software;
}

unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
                                      Type *Src) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  EVT SrcTy = TLI->getValueType(Src);
  EVT DstTy = TLI->getValueType(Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);

  static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
    // LowerVectorINT_TO_FP:
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },

    // Complex: to v2f32
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },

    // Complex: to v4f32
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },

    // Complex: to v2f64
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

    // LowerVectorFP_TO_INT
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },

    // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },

    // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },

    // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
  };

  int Idx = ConvertCostTableLookup<MVT>(
      ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
      SrcTy.getSimpleVT());
  if (Idx != -1)
    return ConversionTbl[Idx].Cost;

  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
}

unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
                                        unsigned Index) const {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // The element at index zero is already inside the vector.
    if (Index == 0)
      return 0;
  }

  // All other insert/extracts cost this much.
  return 2;
}

unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                            OperandValueKind Opd1Info,
                                            OperandValueKind Opd2Info) const {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
                                                       Opd2Info);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // These nodes are marked as 'custom' for combining purposes only.
    // We know that they are legal. See LowerAdd in ISelLowering.
    return 1 * LT.first;
  }
}

unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
  // Address computations in vectorized code with non-consecutive addresses
  // will likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;

  if (Ty->isVectorTy() && IsComplex)
    return NumVectorInstToHideOverhead;

  // In many cases the address computation is not merged into the instruction
  // addressing mode.
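  // For illustration (an assumed example, not from the original source): a GEP
  // feeding a load can often be folded into a [base, #imm] or
  // [base, index, lsl #shift] addressing mode, but when it cannot, a separate
  // ADD is emitted, so a nominal cost of one is charged rather than zero.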
  return 1;
}

unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                        Type *CondTy) const {

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // We do not lower vector selects wider than the register width well.
  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // We would need this many instructions to hide the scalarization
    // happening.
    unsigned AmortizationCost = 20;
    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
    VectorSelectTbl[] = {
      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
    };

    EVT SelCondTy = TLI->getValueType(CondTy);
    EVT SelValTy = TLI->getValueType(ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      int Idx =
          ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
                                 SelValTy.getSimpleVT());
      if (Idx != -1)
        return VectorSelectTbl[Idx].Cost;
    }
  }
  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
                                     unsigned Alignment,
                                     unsigned AddressSpace) const {
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);

  if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
      Src->getVectorElementType()->isIntegerTy(64)) {
    // Unaligned stores are extremely inefficient. We don't split unaligned
    // v2i64 stores because of the negative impact that has been shown in
    // practice on inlined memcpy code.
    // We make v2i64 stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
    unsigned AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;
  }

  if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
      Src->getVectorNumElements() < 8) {
    // We scalarize the loads/stores because there is no v.4b register and we
    // have to promote the elements to v.4h.
    unsigned NumVecElts = Src->getVectorNumElements();
    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
    // We generate 2 instructions per vector element.
    return NumVectorizableInstsToAmortize * NumVecElts * 2;
  }

  return LT.first;
}

unsigned AArch64TTI::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
  unsigned Cost = 0;
  for (auto *I : Tys) {
    if (!I->isVectorTy())
      continue;
    if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128)
      Cost += getMemoryOpCost(Instruction::Store, I, 128, 0) +
              getMemoryOpCost(Instruction::Load, I, 128, 0);
  }
  return Cost;
}