//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target-features are a subset of the callers
  // target-features.
  return (CallerBits & CalleeBits) == CalleeBits;
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
int AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  // Negative values are costed via their bitwise complement (presumably
  // MOVN-style materialization has the same move count).
  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}

/// Calculate the cost of materializing the given constant.
int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  int Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max(1, Cost);
}

/// Return the cost of materializing \p Imm when it appears as operand \p Idx
/// of instruction \p Opcode. Immediates that can be folded into the selected
/// instruction are reported as TCC_Free so constant hoisting leaves them
/// in place.
int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                      const APInt &Imm, Type *Ty,
                                      TTI::TargetCostKind CostKind,
                                      Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    // Treat the immediate as free when it is no more expensive than the
    // baseline cost of materializing its 64-bit chunks.
    int NumConstants = (BitSize + 63) / 64;
    int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

/// Return the cost of materializing \p Imm when it appears as operand \p Idx
/// of a call to intrinsic \p IID.
int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                        const APInt &Imm, Type *Ty,
                                        TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
212 return TTI::PSK_Software; 213 } 214 215 unsigned 216 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 217 TTI::TargetCostKind CostKind) { 218 auto *RetTy = ICA.getReturnType(); 219 switch (ICA.getID()) { 220 case Intrinsic::smin: 221 case Intrinsic::umin: 222 case Intrinsic::smax: 223 case Intrinsic::umax: { 224 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, 225 MVT::v8i16, MVT::v2i32, MVT::v4i32}; 226 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 227 if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; })) 228 return LT.first; 229 break; 230 } 231 default: 232 break; 233 } 234 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 235 } 236 237 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, 238 ArrayRef<const Value *> Args) { 239 240 // A helper that returns a vector type from the given type. The number of 241 // elements in type Ty determine the vector width. 242 auto toVectorTy = [&](Type *ArgTy) { 243 return VectorType::get(ArgTy->getScalarType(), 244 cast<VectorType>(DstTy)->getElementCount()); 245 }; 246 247 // Exit early if DstTy is not a vector type whose elements are at least 248 // 16-bits wide. 249 if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16) 250 return false; 251 252 // Determine if the operation has a widening variant. We consider both the 253 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the 254 // instructions. 255 // 256 // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we 257 // verify that their extending operands are eliminated during code 258 // generation. 259 switch (Opcode) { 260 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). 261 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). 
    break;
  default:
    return false;
  }

  // To be a widening instruction (either the "wide" or "long" versions), the
  // second operand must be a sign- or zero extend having a single user. We
  // only consider extends having a single user because they may otherwise not
  // be eliminated.
  if (Args.size() != 2 ||
      (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
      !Args[1]->hasOneUse())
    return false;
  auto *Extend = cast<CastInst>(Args[1]);

  // Legalize the destination type and ensure it can be used in a widening
  // operation.
  auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
  unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
  if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
    return false;

  // Legalize the source type and ensure it can be used in a widening
  // operation.
  auto *SrcTy = toVectorTy(Extend->getSrcTy());
  auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
  if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
    return false;

  // Get the total number of vector elements in the legalized types.
  unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorMinNumElements();
  unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorMinNumElements();

  // Return true if the legalized types have the same number of vector elements
  // and the destination element type size is twice that of the source type.
  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
}

/// Cost of a cast. Extends feeding widening instructions may be free; other
/// conversions are looked up in a hand-maintained table of shuffle/convert
/// instruction counts before deferring to the base implementation.
int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                     TTI::CastContextHint CCH,
                                     TTI::TargetCostKind CostKind,
                                     const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // If the cast is observable, and it is used by a widening instruction (e.g.,
  // uaddl, saddw, etc.), it may be free.
  if (I && I->hasOneUse()) {
    auto *SingleUser = cast<Instruction>(*I->user_begin());
    SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
    if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
      // If the cast is the second operand, it is free. We will generate either
      // a "wide" or "long" version of the widening instruction.
      if (I == SingleUser->getOperand(1))
        return 0;
      // If the cast is not the second operand, it will be free if it looks the
      // same as the second operand. In this case, we will generate a "long"
      // version of the widening instruction.
      if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
        if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
            cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
          return 0;
    }
  }

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](int Cost) {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  static const TypeConversionCostTblEntry
  ConversionTbl[] = {
    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32,  1 },
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64,  0 },
    { ISD::TRUNCATE, MVT::v8i8,  MVT::v8i32,  3 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },

    // The number of shll instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // LowerVectorINT_TO_FP:
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },

    // Complex: to v2f32
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },

    // Complex: to v4f32
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8,  4 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },

    // Complex: to v8f32
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },

    // Complex: to v16f32
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },

    // Complex: to v2f64
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },


    // LowerVectorFP_TO_INT
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },

    // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f32, 1 },

    // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_SINT, MVT::v4i8,  MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i8,  MVT::v4f32, 2 },

    // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },
  };

  if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
                                                 DstTy.getSimpleVT(),
                                                 SrcTy.getSimpleVT()))
    return AdjustCost(Entry->Cost);

  return AdjustCost(
      BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}

/// Cost of extracting a vector element and then sign/zero extending it.
/// The extension is often folded into the extracting move (smov/umov).
int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                             VectorType *VecTy,
                                             unsigned Index) {

  // Make sure we were given a valid extend opcode.
  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
         "Invalid opcode");

  // We are extending an element we extract from a vector, so the source type
  // of the extend is the element type of the vector.
  auto *Src = VecTy->getElementType();

  // Sign- and zero-extends are for integer types only.
  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");

  // Get the cost for the extract. We compute the cost (if any) for the extend
  // below.
  auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);

  // Legalize the types.
  auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
  auto DstVT = TLI->getValueType(DL, Dst);
  auto SrcVT = TLI->getValueType(DL, Src);
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // If the resulting type is still a vector and the destination type is legal,
  // we may get the extension for free. If not, get the default cost for the
  // extend.
  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
    return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
                                   CostKind);

  // The destination type should be larger than the element type. If not, get
  // the default cost for the extend.
  if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
    return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
                                   CostKind);

  switch (Opcode) {
  default:
    llvm_unreachable("Opcode should be either SExt or ZExt");

  // For sign-extends, we only need a smov, which performs the extension
  // automatically.
  case Instruction::SExt:
    return Cost;

  // For zero-extends, the extend is performed automatically by a umov unless
  // the destination type is i64 and the element type is i8 or i16.
  case Instruction::ZExt:
    if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
      return Cost;
  }

  // If we are unable to perform the extend for free, get the default cost.
  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
                                 CostKind);
}

unsigned AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
                                        TTI::TargetCostKind CostKind) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
  // Branches are assumed to be predicted.
  return 0;
}

/// Cost of inserting/extracting a single vector element.
int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                       unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // The element at index zero is already inside the vector.
    if (Index == 0)
      return 0;
  }

  // All other insert/extracts cost this much.
  return ST->getVectorInsertExtractBaseCost();
}

int AArch64TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
  // add in the widening overhead specified by the sub-target. Since the
  // extends feeding widening instructions are performed automatically, they
  // aren't present in the generated code and have a zero cost. By adding a
  // widening overhead here, we attach the total cost of the combined operation
  // to the widening instruction.
  int Cost = 0;
  if (isWideningInstruction(Ty, Opcode, Args))
    Cost += ST->getWideningBaseCost();

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                                Opd2Info,
                                                Opd1PropInfo, Opd2PropInfo);
  case ISD::SDIV:
    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
        Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
      // On AArch64, scalar signed division by constants power-of-two are
      // normally expanded to the sequence ADD + CMP + SELECT + SRA.
      // The OperandValue properties may not be the same as that of the
      // previous operation; conservatively assume OP_None.
      Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                     Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
                                     Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                     Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      return Cost;
    }
    LLVM_FALLTHROUGH;
  case ISD::UDIV:
    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
      auto VT = TLI->getValueType(DL, Ty);
      if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
        // Vector signed division by constant are expanded to the
        // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
        // to MULHS + SUB + SRL + ADD + SRL.
652 if (isa<FixedVectorType>(Ty) && !Ty->getScalarType()->isFP128Ty()) 653 return (Cost + 2) * LT.first; 654 655 return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 656 Opd2Info, 657 Opd1PropInfo, Opd2PropInfo); 658 } 659 } 660 661 int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, 662 const SCEV *Ptr) { 663 // Address computations in vectorized code with non-consecutive addresses will 664 // likely result in more instructions compared to scalar code where the 665 // computation can more often be merged into the index mode. The resulting 666 // extra micro-ops can significantly decrease throughput. 667 unsigned NumVectorInstToHideOverhead = 10; 668 int MaxMergeDistance = 64; 669 670 if (Ty->isVectorTy() && SE && 671 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) 672 return NumVectorInstToHideOverhead; 673 674 // In many cases the address computation is not merged into the instruction 675 // addressing mode. 676 return 1; 677 } 678 679 int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 680 Type *CondTy, CmpInst::Predicate VecPred, 681 TTI::TargetCostKind CostKind, 682 const Instruction *I) { 683 // TODO: Handle other cost kinds. 684 if (CostKind != TTI::TCK_RecipThroughput) 685 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 686 I); 687 688 int ISD = TLI->InstructionOpcodeToISD(Opcode); 689 // We don't lower some vector selects well that are wider than the register 690 // width. 691 if (ValTy->isVectorTy() && ISD == ISD::SELECT) { 692 // We would need this many instructions to hide the scalarization happening. 693 const int AmortizationCost = 20; 694 695 // If VecPred is not set, check if we can get a predicate from the context 696 // instruction, if its type matches the requested ValTy. 
697 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { 698 CmpInst::Predicate CurrentPred; 699 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), 700 m_Value()))) 701 VecPred = CurrentPred; 702 } 703 // Check if we have a compare/select chain that can be lowered using CMxx & 704 // BFI pair. 705 if (CmpInst::isIntPredicate(VecPred)) { 706 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, 707 MVT::v8i16, MVT::v2i32, MVT::v4i32, 708 MVT::v2i64}; 709 auto LT = TLI->getTypeLegalizationCost(DL, ValTy); 710 if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; })) 711 return LT.first; 712 } 713 714 static const TypeConversionCostTblEntry 715 VectorSelectTbl[] = { 716 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, 717 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, 718 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, 719 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, 720 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, 721 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } 722 }; 723 724 EVT SelCondTy = TLI->getValueType(DL, CondTy); 725 EVT SelValTy = TLI->getValueType(DL, ValTy); 726 if (SelCondTy.isSimple() && SelValTy.isSimple()) { 727 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, 728 SelCondTy.getSimpleVT(), 729 SelValTy.getSimpleVT())) 730 return Entry->Cost; 731 } 732 } 733 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); 734 } 735 736 AArch64TTIImpl::TTI::MemCmpExpansionOptions 737 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { 738 TTI::MemCmpExpansionOptions Options; 739 if (ST->requiresStrictAlign()) { 740 // TODO: Add cost modeling for strict align. Misaligned loads expand to 741 // a bunch of instructions when strict align is enabled. 
    return Options;
  }
  Options.AllowOverlappingLoads = true;
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  Options.NumLoadsPerBlock = Options.MaxNumLoads;
  // TODO: Though vector loads usually perform well on AArch64, in some targets
  // they may wake up the FP unit, which raises the power consumption. Perhaps
  // they could be used with no holds barred (-O3).
  Options.LoadSizes = {8, 4, 2, 1};
  return Options;
}

/// Return true if fixed-width vectors on this subtarget are handled with NEON
/// rather than SVE.
bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
  return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
}

int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
                                    MaybeAlign Alignment, unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind,
                                    const Instruction *I) {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  // Type legalization can't handle structs
  if (TLI->getValueType(DL, Ty, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
                                  CostKind);

  auto LT = TLI->getTypeLegalizationCost(DL, Ty);

  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
      LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
    // Unaligned stores are extremely inefficient. We don't split all
    // unaligned 128-bit stores because the negative impact that has shown in
    // practice on inlined block copy code.
    // We make such stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
    const int AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;
  }

  if (useNeonVector(Ty) &&
      cast<VectorType>(Ty)->getElementType()->isIntegerTy(8)) {
    unsigned ProfitableNumElements;
    if (Opcode == Instruction::Store)
      // We use a custom trunc store lowering so v.4b should be profitable.
      ProfitableNumElements = 4;
    else
      // We scalarize the loads because there is not v.4b register and we
      // have to promote the elements to v.2.
      ProfitableNumElements = 8;

    if (cast<FixedVectorType>(Ty)->getNumElements() < ProfitableNumElements) {
      unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
      unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
      // We generate 2 instructions per vector element.
      return NumVectorizableInstsToAmortize * NumVecElts * 2;
    }
  }

  return LT.first;
}

int AArch64TTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  assert(Factor >= 2 && "Invalid interleave factor");
  auto *VecVTy = cast<FixedVectorType>(VecTy);

  if (!UseMaskForCond && !UseMaskForGaps &&
      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    unsigned NumElts = VecVTy->getNumElements();
    auto *SubVecTy =
        FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);

    // ldN/stN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
    // matched to more than one ldN/stN instruction.
    if (NumElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(SubVecTy, DL))
      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}

/// Cost of spilling/reloading 128-bit vector values that are live across a
/// call, modeled as a store + load per such value.
int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
  int Cost = 0;
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  for (auto *I : Tys) {
    if (!I->isVectorTy())
      continue;
    if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
        128)
      Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
              getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
  }
  return Cost;
}

unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  return ST->getMaxInterleaveFactor();
}

// For Falkor, we want to avoid having too many strided loads in a loop since
// that can exhaust the HW prefetcher resources.  We adjust the unroller
// MaxCount preference below to attempt to ensure unrolling doesn't create too
// many strided loads.
static void
getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                              TargetTransformInfo::UnrollingPreferences &UP) {
  enum { MaxStridedLoads = 7 };
  // Count loads whose address is a non-invariant affine AddRec, i.e. loads
  // that stride through memory with the loop induction variable.
  auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
    int StridedLoads = 0;
    // FIXME? We could make this more precise by looking at the CFG and
    // e.g. not counting loads in each side of an if-then-else diamond.
    for (const auto BB : L->blocks()) {
      for (auto &I : *BB) {
        LoadInst *LMemI = dyn_cast<LoadInst>(&I);
        if (!LMemI)
          continue;

        Value *PtrValue = LMemI->getPointerOperand();
        if (L->isLoopInvariant(PtrValue))
          continue;

        const SCEV *LSCEV = SE.getSCEV(PtrValue);
        const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
        if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
          continue;

        // FIXME? We could take pairing of unrolled load copies into account
        // by looking at the AddRec, but we would probably have to limit this
        // to loops with no stores or other memory optimization barriers.
        ++StridedLoads;
        // We've seen enough strided loads that seeing more won't make a
        // difference.
        if (StridedLoads > MaxStridedLoads / 2)
          return StridedLoads;
      }
    }
    return StridedLoads;
  };

  int StridedLoads = countStridedLoads(L, SE);
  LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
                    << " strided loads\n");
  // Pick the largest power of 2 unroll count that won't result in too many
  // strided loads.
  if (StridedLoads) {
    UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
    LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
                      << UP.MaxCount << '\n');
  }
}

void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP) {
  // Enable partial unrolling and runtime unrolling.
  BaseT::getUnrollingPreferences(L, SE, UP);

  // For inner loop, it is more likely to be a hot one, and the runtime check
  // can be promoted out from LICM pass, so the overhead is less, let's try
  // a larger threshold to unroll more loops.
  if (L->getLoopDepth() > 1)
    UP.PartialThreshold *= 2;

  // Disable partial & runtime unrolling on -Os.
915 UP.PartialOptSizeThreshold = 0; 916 917 if (ST->getProcFamily() == AArch64Subtarget::Falkor && 918 EnableFalkorHWPFUnrollFix) 919 getFalkorUnrollingPreferences(L, SE, UP); 920 } 921 922 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 923 TTI::PeelingPreferences &PP) { 924 BaseT::getPeelingPreferences(L, SE, PP); 925 } 926 927 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, 928 Type *ExpectedType) { 929 switch (Inst->getIntrinsicID()) { 930 default: 931 return nullptr; 932 case Intrinsic::aarch64_neon_st2: 933 case Intrinsic::aarch64_neon_st3: 934 case Intrinsic::aarch64_neon_st4: { 935 // Create a struct type 936 StructType *ST = dyn_cast<StructType>(ExpectedType); 937 if (!ST) 938 return nullptr; 939 unsigned NumElts = Inst->getNumArgOperands() - 1; 940 if (ST->getNumElements() != NumElts) 941 return nullptr; 942 for (unsigned i = 0, e = NumElts; i != e; ++i) { 943 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) 944 return nullptr; 945 } 946 Value *Res = UndefValue::get(ExpectedType); 947 IRBuilder<> Builder(Inst); 948 for (unsigned i = 0, e = NumElts; i != e; ++i) { 949 Value *L = Inst->getArgOperand(i); 950 Res = Builder.CreateInsertValue(Res, L, i); 951 } 952 return Res; 953 } 954 case Intrinsic::aarch64_neon_ld2: 955 case Intrinsic::aarch64_neon_ld3: 956 case Intrinsic::aarch64_neon_ld4: 957 if (Inst->getType() == ExpectedType) 958 return Inst; 959 return nullptr; 960 } 961 } 962 963 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, 964 MemIntrinsicInfo &Info) { 965 switch (Inst->getIntrinsicID()) { 966 default: 967 break; 968 case Intrinsic::aarch64_neon_ld2: 969 case Intrinsic::aarch64_neon_ld3: 970 case Intrinsic::aarch64_neon_ld4: 971 Info.ReadMem = true; 972 Info.WriteMem = false; 973 Info.PtrVal = Inst->getArgOperand(0); 974 break; 975 case Intrinsic::aarch64_neon_st2: 976 case Intrinsic::aarch64_neon_st3: 977 case Intrinsic::aarch64_neon_st4: 978 Info.ReadMem = false; 
979 Info.WriteMem = true; 980 Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1); 981 break; 982 } 983 984 switch (Inst->getIntrinsicID()) { 985 default: 986 return false; 987 case Intrinsic::aarch64_neon_ld2: 988 case Intrinsic::aarch64_neon_st2: 989 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; 990 break; 991 case Intrinsic::aarch64_neon_ld3: 992 case Intrinsic::aarch64_neon_st3: 993 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; 994 break; 995 case Intrinsic::aarch64_neon_ld4: 996 case Intrinsic::aarch64_neon_st4: 997 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; 998 break; 999 } 1000 return true; 1001 } 1002 1003 /// See if \p I should be considered for address type promotion. We check if \p 1004 /// I is a sext with right type and used in memory accesses. If it used in a 1005 /// "complex" getelementptr, we allow it to be promoted without finding other 1006 /// sext instructions that sign extended the same initial value. A getelementptr 1007 /// is considered as "complex" if it has more than 2 operands. 1008 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( 1009 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { 1010 bool Considerable = false; 1011 AllowPromotionWithoutCommonHeader = false; 1012 if (!isa<SExtInst>(&I)) 1013 return false; 1014 Type *ConsideredSExtType = 1015 Type::getInt64Ty(I.getParent()->getParent()->getContext()); 1016 if (I.getType() != ConsideredSExtType) 1017 return false; 1018 // See if the sext is the one with the right type and used in at least one 1019 // GetElementPtrInst. 1020 for (const User *U : I.users()) { 1021 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { 1022 Considerable = true; 1023 // A getelementptr is considered as "complex" if it has more than 2 1024 // operands. We will promote a SExt used in such complex GEP as we 1025 // expect some computation to be merged if they are done on 64 bits. 
1026 if (GEPInst->getNumOperands() > 2) { 1027 AllowPromotionWithoutCommonHeader = true; 1028 break; 1029 } 1030 } 1031 } 1032 return Considerable; 1033 } 1034 1035 bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, 1036 TTI::ReductionFlags Flags) const { 1037 auto *VTy = cast<VectorType>(Ty); 1038 unsigned ScalarBits = Ty->getScalarSizeInBits(); 1039 switch (Opcode) { 1040 case Instruction::FAdd: 1041 case Instruction::FMul: 1042 case Instruction::And: 1043 case Instruction::Or: 1044 case Instruction::Xor: 1045 case Instruction::Mul: 1046 return false; 1047 case Instruction::Add: 1048 return ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128; 1049 case Instruction::ICmp: 1050 return (ScalarBits < 64) && 1051 (ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128); 1052 case Instruction::FCmp: 1053 return Flags.NoNaN; 1054 default: 1055 llvm_unreachable("Unhandled reduction opcode"); 1056 } 1057 return false; 1058 } 1059 1060 int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, 1061 VectorType *ValTy, 1062 bool IsPairwiseForm, 1063 TTI::TargetCostKind CostKind) { 1064 1065 if (IsPairwiseForm) 1066 return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, 1067 CostKind); 1068 1069 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 1070 MVT MTy = LT.second; 1071 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1072 assert(ISD && "Invalid opcode"); 1073 1074 // Horizontal adds can use the 'addv' instruction. We model the cost of these 1075 // instructions as normal vector adds. This is the only arithmetic vector 1076 // reduction operation for which we have an instruction. 
1077 static const CostTblEntry CostTblNoPairwise[]{ 1078 {ISD::ADD, MVT::v8i8, 1}, 1079 {ISD::ADD, MVT::v16i8, 1}, 1080 {ISD::ADD, MVT::v4i16, 1}, 1081 {ISD::ADD, MVT::v8i16, 1}, 1082 {ISD::ADD, MVT::v4i32, 1}, 1083 }; 1084 1085 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) 1086 return LT.first * Entry->Cost; 1087 1088 return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, 1089 CostKind); 1090 } 1091 1092 int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, 1093 int Index, VectorType *SubTp) { 1094 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || 1095 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc) { 1096 static const CostTblEntry ShuffleTbl[] = { 1097 // Broadcast shuffle kinds can be performed with 'dup'. 1098 { TTI::SK_Broadcast, MVT::v8i8, 1 }, 1099 { TTI::SK_Broadcast, MVT::v16i8, 1 }, 1100 { TTI::SK_Broadcast, MVT::v4i16, 1 }, 1101 { TTI::SK_Broadcast, MVT::v8i16, 1 }, 1102 { TTI::SK_Broadcast, MVT::v2i32, 1 }, 1103 { TTI::SK_Broadcast, MVT::v4i32, 1 }, 1104 { TTI::SK_Broadcast, MVT::v2i64, 1 }, 1105 { TTI::SK_Broadcast, MVT::v2f32, 1 }, 1106 { TTI::SK_Broadcast, MVT::v4f32, 1 }, 1107 { TTI::SK_Broadcast, MVT::v2f64, 1 }, 1108 // Transpose shuffle kinds can be performed with 'trn1/trn2' and 1109 // 'zip1/zip2' instructions. 1110 { TTI::SK_Transpose, MVT::v8i8, 1 }, 1111 { TTI::SK_Transpose, MVT::v16i8, 1 }, 1112 { TTI::SK_Transpose, MVT::v4i16, 1 }, 1113 { TTI::SK_Transpose, MVT::v8i16, 1 }, 1114 { TTI::SK_Transpose, MVT::v2i32, 1 }, 1115 { TTI::SK_Transpose, MVT::v4i32, 1 }, 1116 { TTI::SK_Transpose, MVT::v2i64, 1 }, 1117 { TTI::SK_Transpose, MVT::v2f32, 1 }, 1118 { TTI::SK_Transpose, MVT::v4f32, 1 }, 1119 { TTI::SK_Transpose, MVT::v2f64, 1 }, 1120 // Select shuffle kinds. 1121 // TODO: handle vXi8/vXi16. 1122 { TTI::SK_Select, MVT::v2i32, 1 }, // mov. 1123 { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar). 1124 { TTI::SK_Select, MVT::v2i64, 1 }, // mov. 
1125 { TTI::SK_Select, MVT::v2f32, 1 }, // mov. 1126 { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar). 1127 { TTI::SK_Select, MVT::v2f64, 1 }, // mov. 1128 // PermuteSingleSrc shuffle kinds. 1129 // TODO: handle vXi8/vXi16. 1130 { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov. 1131 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case. 1132 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov. 1133 { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov. 1134 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case. 1135 { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov. 1136 }; 1137 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 1138 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) 1139 return LT.first * Entry->Cost; 1140 } 1141 1142 return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); 1143 } 1144