//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "aarch64tti"

/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  unsigned LZ = countLeadingZeros((uint64_t)Val);
  return (64 - LZ + 15) / 16;
}

/// \brief Calculate the cost of materializing the given constant.
unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64 bits.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  unsigned Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max(1U, Cost);
}

unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
                                       const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
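  // The shift amount is encoded in an immediate field of the shift
  // instruction itself (e.g. "lsr x0, x1, #4"), so a constant shift amount
  // never needs to be materialized into a register.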
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    unsigned NumConstants = (BitSize + 63) / 64;
    unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<unsigned>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty);
}

unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                                       const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      unsigned NumConstants = (BitSize + 63) / 64;
      unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<unsigned>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128-bit popcount.
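  // All other widths fall back to a software expansion.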
  return TTI::PSK_Software;
}

unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                          Type *Src) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  EVT SrcTy = TLI->getValueType(Src);
  EVT DstTy = TLI->getValueType(Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return BaseT::getCastInstrCost(Opcode, Dst, Src);

  static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
    // LowerVectorINT_TO_FP:
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },

    // Complex: to v2f32
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },

    // Complex: to v4f32
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },

    // Complex: to v2f64
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

    // LowerVectorFP_TO_INT
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },

    // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },

    // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2.
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },

    // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
  };

  int Idx = ConvertCostTableLookup<MVT>(
      ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
      SrcTy.getSimpleVT());
  if (Idx != -1)
    return ConversionTbl[Idx].Cost;

  return BaseT::getCastInstrCost(Opcode, Dst, Src);
}

unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                            unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // The element at index zero is already inside the vector.
    if (Index == 0)
      return 0;
  }

  // All other insert/extracts cost this much.
  return 2;
}

unsigned AArch64TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo) {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  if (ISD == ISD::SDIV &&
      Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    // On AArch64, scalar signed division by a power-of-two constant is
    // normally expanded to the sequence ADD + CMP + SELECT + SRA.
    // The OperandValue properties may not be the same as those of the
    // previous operation; conservatively assume OP_None.
    unsigned Cost =
        getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
                               TargetTransformInfo::OP_None,
                               TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    return Cost;
  }

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // These nodes are marked as 'custom' for combining purposes only.
    // We know that they are legal. See LowerAdd in ISelLowering.
    return 1 * LT.first;
  }
}

unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
  // Address computations in vectorized code with non-consecutive addresses
  // will likely result in more instructions compared to scalar code where
  // the computation can more often be merged into the index mode. The
  // resulting extra micro-ops can significantly decrease throughput.
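  // Returning a high cost here means a complex vector address computation
  // only pays off when roughly this many vector instructions can hide the
  // overhead.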
  unsigned NumVectorInstToHideOverhead = 10;

  if (Ty->isVectorTy() && IsComplex)
    return NumVectorInstToHideOverhead;

  // In many cases the address computation is not merged into the instruction
  // addressing mode.
  return 1;
}

unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                            Type *CondTy) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // We don't lower vector selects that are wider than the register width
  // well.
  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // We would need this many instructions to hide the scalarization that
    // happens.
    unsigned AmortizationCost = 20;
    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
    VectorSelectTbl[] = {
      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
    };

    EVT SelCondTy = TLI->getValueType(CondTy);
    EVT SelValTy = TLI->getValueType(ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      int Idx =
          ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
                                 SelValTy.getSimpleVT());
      if (Idx != -1)
        return VectorSelectTbl[Idx].Cost;
    }
  }
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                         unsigned Alignment,
                                         unsigned AddressSpace) {
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);

  if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
      Src->getVectorElementType()->isIntegerTy(64)) {
    // Unaligned stores are extremely inefficient. We don't split unaligned
    // v2i64 stores because of the negative impact that has shown up in
    // practice on inlined memcpy code.
    // We make v2i64 stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
    unsigned AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;
  }

  if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
      Src->getVectorNumElements() < 8) {
    // We scalarize the loads/stores because there is no v.4b register and we
    // have to promote the elements to v.4h.
    unsigned NumVecElts = Src->getVectorNumElements();
    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
    // We generate 2 instructions per vector element.
    return NumVectorizableInstsToAmortize * NumVecElts * 2;
  }

  return LT.first;
}

unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
  unsigned Cost = 0;
  for (auto *I : Tys) {
    if (!I->isVectorTy())
      continue;
    if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128)
      Cost += getMemoryOpCost(Instruction::Store, I, 128, 0) +
              getMemoryOpCost(Instruction::Load, I, 128, 0);
  }
  return Cost;
}

unsigned AArch64TTIImpl::getMaxInterleaveFactor() {
  if (ST->isCortexA57())
    return 4;
  return 2;
}

void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
                                             TTI::UnrollingPreferences &UP) {
  // Disable partial & runtime unrolling on -Os.
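  // A threshold of zero rejects every partial/runtime unroll candidate in
  // functions optimized for size.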
  UP.PartialOptSizeThreshold = 0;
}

Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                         Type *ExpectedType) {
  switch (Inst->getIntrinsicID()) {
  default:
    return nullptr;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4: {
    // Check that the expected type is a struct whose elements match the
    // stored values, then rebuild the stored tuple as a struct value.
    StructType *ST = dyn_cast<StructType>(ExpectedType);
    if (!ST)
      return nullptr;
    unsigned NumElts = Inst->getNumArgOperands() - 1;
    if (ST->getNumElements() != NumElts)
      return nullptr;
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
        return nullptr;
    }
    Value *Res = UndefValue::get(ExpectedType);
    IRBuilder<> Builder(Inst);
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      Value *L = Inst->getArgOperand(i);
      Res = Builder.CreateInsertValue(Res, L, i);
    }
    return Res;
  }
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    if (Inst->getType() == ExpectedType)
      return Inst;
    return nullptr;
  }
}

bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                        MemIntrinsicInfo &Info) {
  switch (Inst->getIntrinsicID()) {
  default:
    break;
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    Info.ReadMem = true;
    Info.WriteMem = false;
    Info.Vol = false;
    Info.NumMemRefs = 1;
    Info.PtrVal = Inst->getArgOperand(0);
    break;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
    Info.ReadMem = false;
    Info.WriteMem = true;
    Info.Vol = false;
    Info.NumMemRefs = 1;
    Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
    break;
  }

  switch (Inst->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_st2:
    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_st3:
    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_st4:
    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
    break;
  }
  return true;
}