1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "AArch64TargetTransformInfo.h" 11 #include "MCTargetDesc/AArch64AddressingModes.h" 12 #include "llvm/Analysis/LoopInfo.h" 13 #include "llvm/Analysis/TargetTransformInfo.h" 14 #include "llvm/CodeGen/BasicTTIImpl.h" 15 #include "llvm/Support/Debug.h" 16 #include "llvm/Target/CostTable.h" 17 #include "llvm/Target/TargetLowering.h" 18 #include <algorithm> 19 using namespace llvm; 20 21 #define DEBUG_TYPE "aarch64tti" 22 23 /// \brief Calculate the cost of materializing a 64-bit value. This helper 24 /// method might only calculate a fraction of a larger immediate. Therefore it 25 /// is valid to return a cost of ZERO. 26 int AArch64TTIImpl::getIntImmCost(int64_t Val) { 27 // Check if the immediate can be encoded within an instruction. 28 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) 29 return 0; 30 31 if (Val < 0) 32 Val = ~Val; 33 34 // Calculate how many moves we will need to materialize this constant. 35 unsigned LZ = countLeadingZeros((uint64_t)Val); 36 return (64 - LZ + 15) / 16; 37 } 38 39 /// \brief Calculate the cost of materializing the given constant. 40 int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { 41 assert(Ty->isIntegerTy()); 42 43 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 44 if (BitSize == 0) 45 return ~0U; 46 47 // Sign-extend all constants to a multiple of 64-bit. 48 APInt ImmVal = Imm; 49 if (BitSize & 0x3f) 50 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); 51 52 // Split the constant into 64-bit chunks and calculate the cost for each 53 // chunk. 54 int Cost = 0; 55 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { 56 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); 57 int64_t Val = Tmp.getSExtValue(); 58 Cost += getIntImmCost(Val); 59 } 60 // We need at least one instruction to materialze the constant. 61 return std::max(1, Cost); 62 } 63 64 int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, 65 const APInt &Imm, Type *Ty) { 66 assert(Ty->isIntegerTy()); 67 68 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 69 // There is no cost model for constants with a bit size of 0. Return TCC_Free 70 // here, so that constant hoisting will ignore this constant. 71 if (BitSize == 0) 72 return TTI::TCC_Free; 73 74 unsigned ImmIdx = ~0U; 75 switch (Opcode) { 76 default: 77 return TTI::TCC_Free; 78 case Instruction::GetElementPtr: 79 // Always hoist the base address of a GetElementPtr. 80 if (Idx == 0) 81 return 2 * TTI::TCC_Basic; 82 return TTI::TCC_Free; 83 case Instruction::Store: 84 ImmIdx = 0; 85 break; 86 case Instruction::Add: 87 case Instruction::Sub: 88 case Instruction::Mul: 89 case Instruction::UDiv: 90 case Instruction::SDiv: 91 case Instruction::URem: 92 case Instruction::SRem: 93 case Instruction::And: 94 case Instruction::Or: 95 case Instruction::Xor: 96 case Instruction::ICmp: 97 ImmIdx = 1; 98 break; 99 // Always return TCC_Free for the shift value of a shift instruction. 100 case Instruction::Shl: 101 case Instruction::LShr: 102 case Instruction::AShr: 103 if (Idx == 1) 104 return TTI::TCC_Free; 105 break; 106 case Instruction::Trunc: 107 case Instruction::ZExt: 108 case Instruction::SExt: 109 case Instruction::IntToPtr: 110 case Instruction::PtrToInt: 111 case Instruction::BitCast: 112 case Instruction::PHI: 113 case Instruction::Call: 114 case Instruction::Select: 115 case Instruction::Ret: 116 case Instruction::Load: 117 break; 118 } 119 120 if (Idx == ImmIdx) { 121 int NumConstants = (BitSize + 63) / 64; 122 int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); 123 return (Cost <= NumConstants * TTI::TCC_Basic) 124 ? static_cast<int>(TTI::TCC_Free) 125 : Cost; 126 } 127 return AArch64TTIImpl::getIntImmCost(Imm, Ty); 128 } 129 130 int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, 131 const APInt &Imm, Type *Ty) { 132 assert(Ty->isIntegerTy()); 133 134 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 135 // There is no cost model for constants with a bit size of 0. Return TCC_Free 136 // here, so that constant hoisting will ignore this constant. 137 if (BitSize == 0) 138 return TTI::TCC_Free; 139 140 switch (IID) { 141 default: 142 return TTI::TCC_Free; 143 case Intrinsic::sadd_with_overflow: 144 case Intrinsic::uadd_with_overflow: 145 case Intrinsic::ssub_with_overflow: 146 case Intrinsic::usub_with_overflow: 147 case Intrinsic::smul_with_overflow: 148 case Intrinsic::umul_with_overflow: 149 if (Idx == 1) { 150 int NumConstants = (BitSize + 63) / 64; 151 int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); 152 return (Cost <= NumConstants * TTI::TCC_Basic) 153 ? static_cast<int>(TTI::TCC_Free) 154 : Cost; 155 } 156 break; 157 case Intrinsic::experimental_stackmap: 158 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 159 return TTI::TCC_Free; 160 break; 161 case Intrinsic::experimental_patchpoint_void: 162 case Intrinsic::experimental_patchpoint_i64: 163 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 164 return TTI::TCC_Free; 165 break; 166 } 167 return AArch64TTIImpl::getIntImmCost(Imm, Ty); 168 } 169 170 TargetTransformInfo::PopcntSupportKind 171 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { 172 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 173 if (TyWidth == 32 || TyWidth == 64) 174 return TTI::PSK_FastHardware; 175 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. 176 return TTI::PSK_Software; 177 } 178 179 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, 180 ArrayRef<const Value *> Args) { 181 182 // A helper that returns a vector type from the given type. The number of 183 // elements in type Ty determine the vector width. 184 auto toVectorTy = [&](Type *ArgTy) { 185 return VectorType::get(ArgTy->getScalarType(), 186 DstTy->getVectorNumElements()); 187 }; 188 189 // Exit early if DstTy is not a vector type whose elements are at least 190 // 16-bits wide. 191 if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16) 192 return false; 193 194 // Determine if the operation has a widening variant. We consider both the 195 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the 196 // instructions. 197 // 198 // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we 199 // verify that their extending operands are eliminated during code 200 // generation. 201 switch (Opcode) { 202 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). 203 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). 204 break; 205 default: 206 return false; 207 } 208 209 // To be a widening instruction (either the "wide" or "long" versions), the 210 // second operand must be a sign- or zero extend having a single user. We 211 // only consider extends having a single user because they may otherwise not 212 // be eliminated. 213 if (Args.size() != 2 || 214 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) || 215 !Args[1]->hasOneUse()) 216 return false; 217 auto *Extend = cast<CastInst>(Args[1]); 218 219 // Legalize the destination type and ensure it can be used in a widening 220 // operation. 221 auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); 222 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); 223 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) 224 return false; 225 226 // Legalize the source type and ensure it can be used in a widening 227 // operation. 228 Type *SrcTy = toVectorTy(Extend->getSrcTy()); 229 auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); 230 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); 231 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) 232 return false; 233 234 // Get the total number of vector elements in the legalized types. 235 unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements(); 236 unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements(); 237 238 // Return true if the legalized types have the same number of vector elements 239 // and the destination element type size is twice that of the source type. 240 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; 241 } 242 243 int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, 244 const Instruction *I) { 245 int ISD = TLI->InstructionOpcodeToISD(Opcode); 246 assert(ISD && "Invalid opcode"); 247 248 // If the cast is observable, and it is used by a widening instruction (e.g., 249 // uaddl, saddw, etc.), it may be free. 250 if (I && I->hasOneUse()) { 251 auto *SingleUser = cast<Instruction>(*I->user_begin()); 252 SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); 253 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { 254 // If the cast is the second operand, it is free. We will generate either 255 // a "wide" or "long" version of the widening instruction. 256 if (I == SingleUser->getOperand(1)) 257 return 0; 258 // If the cast is not the second operand, it will be free if it looks the 259 // same as the second operand. In this case, we will generate a "long" 260 // version of the widening instruction. 261 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) 262 if (I->getOpcode() == Cast->getOpcode() && 263 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) 264 return 0; 265 } 266 } 267 268 EVT SrcTy = TLI->getValueType(DL, Src); 269 EVT DstTy = TLI->getValueType(DL, Dst); 270 271 if (!SrcTy.isSimple() || !DstTy.isSimple()) 272 return BaseT::getCastInstrCost(Opcode, Dst, Src); 273 274 static const TypeConversionCostTblEntry 275 ConversionTbl[] = { 276 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, 277 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, 278 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, 279 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, 280 281 // The number of shll instructions for the extension. 282 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 283 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 284 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 285 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 286 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 287 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 288 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 289 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 290 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 291 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 292 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 293 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 294 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 295 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 296 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 297 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 298 299 // LowerVectorINT_TO_FP: 300 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 301 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 302 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 303 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 304 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 305 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 306 307 // Complex: to v2f32 308 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 309 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 310 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 311 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 312 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 313 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 314 315 // Complex: to v4f32 316 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, 317 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 318 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 319 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 320 321 // Complex: to v8f32 322 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 323 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 324 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 325 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 326 327 // Complex: to v16f32 328 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 329 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 330 331 // Complex: to v2f64 332 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 333 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 334 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 335 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 336 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 337 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 338 339 340 // LowerVectorFP_TO_INT 341 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, 342 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 343 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 344 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, 345 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 346 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 347 348 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). 349 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, 350 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, 351 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, 352 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, 353 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, 354 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, 355 356 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 357 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, 358 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, 359 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, 360 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, 361 362 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. 363 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, 364 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, 365 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, 366 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, 367 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, 368 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, 369 }; 370 371 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, 372 DstTy.getSimpleVT(), 373 SrcTy.getSimpleVT())) 374 return Entry->Cost; 375 376 return BaseT::getCastInstrCost(Opcode, Dst, Src); 377 } 378 379 int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, 380 VectorType *VecTy, 381 unsigned Index) { 382 383 // Make sure we were given a valid extend opcode. 384 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && 385 "Invalid opcode"); 386 387 // We are extending an element we extract from a vector, so the source type 388 // of the extend is the element type of the vector. 389 auto *Src = VecTy->getElementType(); 390 391 // Sign- and zero-extends are for integer types only. 392 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); 393 394 // Get the cost for the extract. We compute the cost (if any) for the extend 395 // below. 396 auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); 397 398 // Legalize the types. 399 auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); 400 auto DstVT = TLI->getValueType(DL, Dst); 401 auto SrcVT = TLI->getValueType(DL, Src); 402 403 // If the resulting type is still a vector and the destination type is legal, 404 // we may get the extension for free. If not, get the default cost for the 405 // extend. 406 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) 407 return Cost + getCastInstrCost(Opcode, Dst, Src); 408 409 // The destination type should be larger than the element type. If not, get 410 // the default cost for the extend. 411 if (DstVT.getSizeInBits() < SrcVT.getSizeInBits()) 412 return Cost + getCastInstrCost(Opcode, Dst, Src); 413 414 switch (Opcode) { 415 default: 416 llvm_unreachable("Opcode should be either SExt or ZExt"); 417 418 // For sign-extends, we only need a smov, which performs the extension 419 // automatically. 420 case Instruction::SExt: 421 return Cost; 422 423 // For zero-extends, the extend is performed automatically by a umov unless 424 // the destination type is i64 and the element type is i8 or i16. 425 case Instruction::ZExt: 426 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) 427 return Cost; 428 } 429 430 // If we are unable to perform the extend for free, get the default cost. 431 return Cost + getCastInstrCost(Opcode, Dst, Src); 432 } 433 434 int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 435 unsigned Index) { 436 assert(Val->isVectorTy() && "This must be a vector type"); 437 438 if (Index != -1U) { 439 // Legalize the type. 440 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); 441 442 // This type is legalized to a scalar type. 443 if (!LT.second.isVector()) 444 return 0; 445 446 // The type may be split. Normalize the index to the new type. 447 unsigned Width = LT.second.getVectorNumElements(); 448 Index = Index % Width; 449 450 // The element at index zero is already inside the vector. 451 if (Index == 0) 452 return 0; 453 } 454 455 // All other insert/extracts cost this much. 456 return ST->getVectorInsertExtractBaseCost(); 457 } 458 459 int AArch64TTIImpl::getArithmeticInstrCost( 460 unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, 461 TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, 462 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) { 463 // Legalize the type. 464 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 465 466 // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.), 467 // add in the widening overhead specified by the sub-target. Since the 468 // extends feeding widening instructions are performed automatically, they 469 // aren't present in the generated code and have a zero cost. By adding a 470 // widening overhead here, we attach the total cost of the combined operation 471 // to the widening instruction. 472 int Cost = 0; 473 if (isWideningInstruction(Ty, Opcode, Args)) 474 Cost += ST->getWideningBaseCost(); 475 476 int ISD = TLI->InstructionOpcodeToISD(Opcode); 477 478 if (ISD == ISD::SDIV && 479 Opd2Info == TargetTransformInfo::OK_UniformConstantValue && 480 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { 481 // On AArch64, scalar signed division by constants power-of-two are 482 // normally expanded to the sequence ADD + CMP + SELECT + SRA. 483 // The OperandValue properties many not be same as that of previous 484 // operation; conservatively assume OP_None. 485 Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, 486 TargetTransformInfo::OP_None, 487 TargetTransformInfo::OP_None); 488 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, 489 TargetTransformInfo::OP_None, 490 TargetTransformInfo::OP_None); 491 Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info, 492 TargetTransformInfo::OP_None, 493 TargetTransformInfo::OP_None); 494 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info, 495 TargetTransformInfo::OP_None, 496 TargetTransformInfo::OP_None); 497 return Cost; 498 } 499 500 switch (ISD) { 501 default: 502 return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, 503 Opd1PropInfo, Opd2PropInfo); 504 case ISD::ADD: 505 case ISD::MUL: 506 case ISD::XOR: 507 case ISD::OR: 508 case ISD::AND: 509 // These nodes are marked as 'custom' for combining purposes only. 510 // We know that they are legal. See LowerAdd in ISelLowering. 511 return (Cost + 1) * LT.first; 512 } 513 } 514 515 int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, 516 const SCEV *Ptr) { 517 // Address computations in vectorized code with non-consecutive addresses will 518 // likely result in more instructions compared to scalar code where the 519 // computation can more often be merged into the index mode. The resulting 520 // extra micro-ops can significantly decrease throughput. 521 unsigned NumVectorInstToHideOverhead = 10; 522 int MaxMergeDistance = 64; 523 524 if (Ty->isVectorTy() && SE && 525 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) 526 return NumVectorInstToHideOverhead; 527 528 // In many cases the address computation is not merged into the instruction 529 // addressing mode. 530 return 1; 531 } 532 533 int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 534 Type *CondTy, const Instruction *I) { 535 536 int ISD = TLI->InstructionOpcodeToISD(Opcode); 537 // We don't lower some vector selects well that are wider than the register 538 // width. 539 if (ValTy->isVectorTy() && ISD == ISD::SELECT) { 540 // We would need this many instructions to hide the scalarization happening. 541 const int AmortizationCost = 20; 542 static const TypeConversionCostTblEntry 543 VectorSelectTbl[] = { 544 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, 545 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, 546 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, 547 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, 548 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, 549 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } 550 }; 551 552 EVT SelCondTy = TLI->getValueType(DL, CondTy); 553 EVT SelValTy = TLI->getValueType(DL, ValTy); 554 if (SelCondTy.isSimple() && SelValTy.isSimple()) { 555 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, 556 SelCondTy.getSimpleVT(), 557 SelValTy.getSimpleVT())) 558 return Entry->Cost; 559 } 560 } 561 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); 562 } 563 564 int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, 565 unsigned Alignment, unsigned AddressSpace, 566 const Instruction *I) { 567 auto LT = TLI->getTypeLegalizationCost(DL, Ty); 568 569 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && 570 LT.second.is128BitVector() && Alignment < 16) { 571 // Unaligned stores are extremely inefficient. We don't split all 572 // unaligned 128-bit stores because the negative impact that has shown in 573 // practice on inlined block copy code. 574 // We make such stores expensive so that we will only vectorize if there 575 // are 6 other instructions getting vectorized. 576 const int AmortizationCost = 6; 577 578 return LT.first * 2 * AmortizationCost; 579 } 580 581 if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) && 582 Ty->getVectorNumElements() < 8) { 583 // We scalarize the loads/stores because there is not v.4b register and we 584 // have to promote the elements to v.4h. 585 unsigned NumVecElts = Ty->getVectorNumElements(); 586 unsigned NumVectorizableInstsToAmortize = NumVecElts * 2; 587 // We generate 2 instructions per vector element. 588 return NumVectorizableInstsToAmortize * NumVecElts * 2; 589 } 590 591 return LT.first; 592 } 593 594 int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, 595 unsigned Factor, 596 ArrayRef<unsigned> Indices, 597 unsigned Alignment, 598 unsigned AddressSpace) { 599 assert(Factor >= 2 && "Invalid interleave factor"); 600 assert(isa<VectorType>(VecTy) && "Expect a vector type"); 601 602 if (Factor <= TLI->getMaxSupportedInterleaveFactor()) { 603 unsigned NumElts = VecTy->getVectorNumElements(); 604 auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); 605 606 // ldN/stN only support legal vector types of size 64 or 128 in bits. 607 // Accesses having vector types that are a multiple of 128 bits can be 608 // matched to more than one ldN/stN instruction. 609 if (NumElts % Factor == 0 && 610 TLI->isLegalInterleavedAccessType(SubVecTy, DL)) 611 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL); 612 } 613 614 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 615 Alignment, AddressSpace); 616 } 617 618 int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { 619 int Cost = 0; 620 for (auto *I : Tys) { 621 if (!I->isVectorTy()) 622 continue; 623 if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128) 624 Cost += getMemoryOpCost(Instruction::Store, I, 128, 0) + 625 getMemoryOpCost(Instruction::Load, I, 128, 0); 626 } 627 return Cost; 628 } 629 630 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { 631 return ST->getMaxInterleaveFactor(); 632 } 633 634 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, 635 TTI::UnrollingPreferences &UP) { 636 // Enable partial unrolling and runtime unrolling. 637 BaseT::getUnrollingPreferences(L, UP); 638 639 // For inner loop, it is more likely to be a hot one, and the runtime check 640 // can be promoted out from LICM pass, so the overhead is less, let's try 641 // a larger threshold to unroll more loops. 642 if (L->getLoopDepth() > 1) 643 UP.PartialThreshold *= 2; 644 645 // Disable partial & runtime unrolling on -Os. 646 UP.PartialOptSizeThreshold = 0; 647 } 648 649 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, 650 Type *ExpectedType) { 651 switch (Inst->getIntrinsicID()) { 652 default: 653 return nullptr; 654 case Intrinsic::aarch64_neon_st2: 655 case Intrinsic::aarch64_neon_st3: 656 case Intrinsic::aarch64_neon_st4: { 657 // Create a struct type 658 StructType *ST = dyn_cast<StructType>(ExpectedType); 659 if (!ST) 660 return nullptr; 661 unsigned NumElts = Inst->getNumArgOperands() - 1; 662 if (ST->getNumElements() != NumElts) 663 return nullptr; 664 for (unsigned i = 0, e = NumElts; i != e; ++i) { 665 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) 666 return nullptr; 667 } 668 Value *Res = UndefValue::get(ExpectedType); 669 IRBuilder<> Builder(Inst); 670 for (unsigned i = 0, e = NumElts; i != e; ++i) { 671 Value *L = Inst->getArgOperand(i); 672 Res = Builder.CreateInsertValue(Res, L, i); 673 } 674 return Res; 675 } 676 case Intrinsic::aarch64_neon_ld2: 677 case Intrinsic::aarch64_neon_ld3: 678 case Intrinsic::aarch64_neon_ld4: 679 if (Inst->getType() == ExpectedType) 680 return Inst; 681 return nullptr; 682 } 683 } 684 685 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, 686 MemIntrinsicInfo &Info) { 687 switch (Inst->getIntrinsicID()) { 688 default: 689 break; 690 case Intrinsic::aarch64_neon_ld2: 691 case Intrinsic::aarch64_neon_ld3: 692 case Intrinsic::aarch64_neon_ld4: 693 Info.ReadMem = true; 694 Info.WriteMem = false; 695 Info.PtrVal = Inst->getArgOperand(0); 696 break; 697 case Intrinsic::aarch64_neon_st2: 698 case Intrinsic::aarch64_neon_st3: 699 case Intrinsic::aarch64_neon_st4: 700 Info.ReadMem = false; 701 Info.WriteMem = true; 702 Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1); 703 break; 704 } 705 706 switch (Inst->getIntrinsicID()) { 707 default: 708 return false; 709 case Intrinsic::aarch64_neon_ld2: 710 case Intrinsic::aarch64_neon_st2: 711 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; 712 break; 713 case Intrinsic::aarch64_neon_ld3: 714 case Intrinsic::aarch64_neon_st3: 715 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; 716 break; 717 case Intrinsic::aarch64_neon_ld4: 718 case Intrinsic::aarch64_neon_st4: 719 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; 720 break; 721 } 722 return true; 723 } 724 725 /// See if \p I should be considered for address type promotion. We check if \p 726 /// I is a sext with right type and used in memory accesses. If it used in a 727 /// "complex" getelementptr, we allow it to be promoted without finding other 728 /// sext instructions that sign extended the same initial value. A getelementptr 729 /// is considered as "complex" if it has more than 2 operands. 730 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( 731 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { 732 bool Considerable = false; 733 AllowPromotionWithoutCommonHeader = false; 734 if (!isa<SExtInst>(&I)) 735 return false; 736 Type *ConsideredSExtType = 737 Type::getInt64Ty(I.getParent()->getParent()->getContext()); 738 if (I.getType() != ConsideredSExtType) 739 return false; 740 // See if the sext is the one with the right type and used in at least one 741 // GetElementPtrInst. 742 for (const User *U : I.users()) { 743 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { 744 Considerable = true; 745 // A getelementptr is considered as "complex" if it has more than 2 746 // operands. We will promote a SExt used in such complex GEP as we 747 // expect some computation to be merged if they are done on 64 bits. 748 if (GEPInst->getNumOperands() > 2) { 749 AllowPromotionWithoutCommonHeader = true; 750 break; 751 } 752 } 753 } 754 return Considerable; 755 } 756 757 unsigned AArch64TTIImpl::getCacheLineSize() { 758 return ST->getCacheLineSize(); 759 } 760 761 unsigned AArch64TTIImpl::getPrefetchDistance() { 762 return ST->getPrefetchDistance(); 763 } 764 765 unsigned AArch64TTIImpl::getMinPrefetchStride() { 766 return ST->getMinPrefetchStride(); 767 } 768 769 unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() { 770 return ST->getMaxPrefetchIterationsAhead(); 771 } 772 773 bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, 774 TTI::ReductionFlags Flags) const { 775 assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type"); 776 unsigned ScalarBits = Ty->getScalarSizeInBits(); 777 switch (Opcode) { 778 case Instruction::FAdd: 779 case Instruction::FMul: 780 case Instruction::And: 781 case Instruction::Or: 782 case Instruction::Xor: 783 case Instruction::Mul: 784 return false; 785 case Instruction::Add: 786 return ScalarBits * Ty->getVectorNumElements() >= 128; 787 case Instruction::ICmp: 788 return (ScalarBits < 64) && 789 (ScalarBits * Ty->getVectorNumElements() >= 128); 790 case Instruction::FCmp: 791 return Flags.NoNaN; 792 default: 793 llvm_unreachable("Unhandled reduction opcode"); 794 } 795 return false; 796 } 797