1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "AArch64TargetTransformInfo.h" 10 #include "AArch64ExpandImm.h" 11 #include "AArch64PerfectShuffle.h" 12 #include "MCTargetDesc/AArch64AddressingModes.h" 13 #include "llvm/Analysis/IVDescriptors.h" 14 #include "llvm/Analysis/LoopInfo.h" 15 #include "llvm/Analysis/TargetTransformInfo.h" 16 #include "llvm/CodeGen/BasicTTIImpl.h" 17 #include "llvm/CodeGen/CostTable.h" 18 #include "llvm/CodeGen/TargetLowering.h" 19 #include "llvm/IR/IntrinsicInst.h" 20 #include "llvm/IR/Intrinsics.h" 21 #include "llvm/IR/IntrinsicsAArch64.h" 22 #include "llvm/IR/PatternMatch.h" 23 #include "llvm/Support/Debug.h" 24 #include "llvm/Transforms/InstCombine/InstCombiner.h" 25 #include <algorithm> 26 using namespace llvm; 27 using namespace llvm::PatternMatch; 28 29 #define DEBUG_TYPE "aarch64tti" 30 31 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", 32 cl::init(true), cl::Hidden); 33 34 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10), 35 cl::Hidden); 36 37 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", 38 cl::init(10), cl::Hidden); 39 40 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, 41 const Function *Callee) const { 42 const TargetMachine &TM = getTLI()->getTargetMachine(); 43 44 const FeatureBitset &CallerBits = 45 TM.getSubtargetImpl(*Caller)->getFeatureBits(); 46 const FeatureBitset &CalleeBits = 47 TM.getSubtargetImpl(*Callee)->getFeatureBits(); 48 49 // Inline a callee if its target-features are a subset of the callers 50 // target-features. 
51 return (CallerBits & CalleeBits) == CalleeBits; 52 } 53 54 /// Calculate the cost of materializing a 64-bit value. This helper 55 /// method might only calculate a fraction of a larger immediate. Therefore it 56 /// is valid to return a cost of ZERO. 57 InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) { 58 // Check if the immediate can be encoded within an instruction. 59 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) 60 return 0; 61 62 if (Val < 0) 63 Val = ~Val; 64 65 // Calculate how many moves we will need to materialize this constant. 66 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 67 AArch64_IMM::expandMOVImm(Val, 64, Insn); 68 return Insn.size(); 69 } 70 71 /// Calculate the cost of materializing the given constant. 72 InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 73 TTI::TargetCostKind CostKind) { 74 assert(Ty->isIntegerTy()); 75 76 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 77 if (BitSize == 0) 78 return ~0U; 79 80 // Sign-extend all constants to a multiple of 64-bit. 81 APInt ImmVal = Imm; 82 if (BitSize & 0x3f) 83 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); 84 85 // Split the constant into 64-bit chunks and calculate the cost for each 86 // chunk. 87 InstructionCost Cost = 0; 88 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { 89 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); 90 int64_t Val = Tmp.getSExtValue(); 91 Cost += getIntImmCost(Val); 92 } 93 // We need at least one instruction to materialze the constant. 94 return std::max<InstructionCost>(1, Cost); 95 } 96 97 InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 98 const APInt &Imm, Type *Ty, 99 TTI::TargetCostKind CostKind, 100 Instruction *Inst) { 101 assert(Ty->isIntegerTy()); 102 103 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 104 // There is no cost model for constants with a bit size of 0. Return TCC_Free 105 // here, so that constant hoisting will ignore this constant. 
106 if (BitSize == 0) 107 return TTI::TCC_Free; 108 109 unsigned ImmIdx = ~0U; 110 switch (Opcode) { 111 default: 112 return TTI::TCC_Free; 113 case Instruction::GetElementPtr: 114 // Always hoist the base address of a GetElementPtr. 115 if (Idx == 0) 116 return 2 * TTI::TCC_Basic; 117 return TTI::TCC_Free; 118 case Instruction::Store: 119 ImmIdx = 0; 120 break; 121 case Instruction::Add: 122 case Instruction::Sub: 123 case Instruction::Mul: 124 case Instruction::UDiv: 125 case Instruction::SDiv: 126 case Instruction::URem: 127 case Instruction::SRem: 128 case Instruction::And: 129 case Instruction::Or: 130 case Instruction::Xor: 131 case Instruction::ICmp: 132 ImmIdx = 1; 133 break; 134 // Always return TCC_Free for the shift value of a shift instruction. 135 case Instruction::Shl: 136 case Instruction::LShr: 137 case Instruction::AShr: 138 if (Idx == 1) 139 return TTI::TCC_Free; 140 break; 141 case Instruction::Trunc: 142 case Instruction::ZExt: 143 case Instruction::SExt: 144 case Instruction::IntToPtr: 145 case Instruction::PtrToInt: 146 case Instruction::BitCast: 147 case Instruction::PHI: 148 case Instruction::Call: 149 case Instruction::Select: 150 case Instruction::Ret: 151 case Instruction::Load: 152 break; 153 } 154 155 if (Idx == ImmIdx) { 156 int NumConstants = (BitSize + 63) / 64; 157 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 158 return (Cost <= NumConstants * TTI::TCC_Basic) 159 ? static_cast<int>(TTI::TCC_Free) 160 : Cost; 161 } 162 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 163 } 164 165 InstructionCost 166 AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, 167 const APInt &Imm, Type *Ty, 168 TTI::TargetCostKind CostKind) { 169 assert(Ty->isIntegerTy()); 170 171 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 172 // There is no cost model for constants with a bit size of 0. Return TCC_Free 173 // here, so that constant hoisting will ignore this constant. 
174 if (BitSize == 0) 175 return TTI::TCC_Free; 176 177 // Most (all?) AArch64 intrinsics do not support folding immediates into the 178 // selected instruction, so we compute the materialization cost for the 179 // immediate directly. 180 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) 181 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 182 183 switch (IID) { 184 default: 185 return TTI::TCC_Free; 186 case Intrinsic::sadd_with_overflow: 187 case Intrinsic::uadd_with_overflow: 188 case Intrinsic::ssub_with_overflow: 189 case Intrinsic::usub_with_overflow: 190 case Intrinsic::smul_with_overflow: 191 case Intrinsic::umul_with_overflow: 192 if (Idx == 1) { 193 int NumConstants = (BitSize + 63) / 64; 194 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 195 return (Cost <= NumConstants * TTI::TCC_Basic) 196 ? static_cast<int>(TTI::TCC_Free) 197 : Cost; 198 } 199 break; 200 case Intrinsic::experimental_stackmap: 201 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 202 return TTI::TCC_Free; 203 break; 204 case Intrinsic::experimental_patchpoint_void: 205 case Intrinsic::experimental_patchpoint_i64: 206 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 207 return TTI::TCC_Free; 208 break; 209 case Intrinsic::experimental_gc_statepoint: 210 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 211 return TTI::TCC_Free; 212 break; 213 } 214 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 215 } 216 217 TargetTransformInfo::PopcntSupportKind 218 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { 219 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 220 if (TyWidth == 32 || TyWidth == 64) 221 return TTI::PSK_FastHardware; 222 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. 
223 return TTI::PSK_Software; 224 } 225 226 InstructionCost 227 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 228 TTI::TargetCostKind CostKind) { 229 auto *RetTy = ICA.getReturnType(); 230 switch (ICA.getID()) { 231 case Intrinsic::umin: 232 case Intrinsic::umax: 233 case Intrinsic::smin: 234 case Intrinsic::smax: { 235 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, 236 MVT::v8i16, MVT::v2i32, MVT::v4i32}; 237 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 238 // v2i64 types get converted to cmp+bif hence the cost of 2 239 if (LT.second == MVT::v2i64) 240 return LT.first * 2; 241 if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; })) 242 return LT.first; 243 break; 244 } 245 case Intrinsic::sadd_sat: 246 case Intrinsic::ssub_sat: 247 case Intrinsic::uadd_sat: 248 case Intrinsic::usub_sat: { 249 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, 250 MVT::v8i16, MVT::v2i32, MVT::v4i32, 251 MVT::v2i64}; 252 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 253 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we 254 // need to extend the type, as it uses shr(qadd(shl, shl)). 255 unsigned Instrs = 256 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 
1 : 4; 257 if (any_of(ValidSatTys, [<](MVT M) { return M == LT.second; })) 258 return LT.first * Instrs; 259 break; 260 } 261 case Intrinsic::abs: { 262 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, 263 MVT::v8i16, MVT::v2i32, MVT::v4i32, 264 MVT::v2i64}; 265 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 266 if (any_of(ValidAbsTys, [<](MVT M) { return M == LT.second; })) 267 return LT.first; 268 break; 269 } 270 case Intrinsic::experimental_stepvector: { 271 InstructionCost Cost = 1; // Cost of the `index' instruction 272 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 273 // Legalisation of illegal vectors involves an `index' instruction plus 274 // (LT.first - 1) vector adds. 275 if (LT.first > 1) { 276 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext()); 277 InstructionCost AddCost = 278 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind); 279 Cost += AddCost * (LT.first - 1); 280 } 281 return Cost; 282 } 283 case Intrinsic::bitreverse: { 284 static const CostTblEntry BitreverseTbl[] = { 285 {Intrinsic::bitreverse, MVT::i32, 1}, 286 {Intrinsic::bitreverse, MVT::i64, 1}, 287 {Intrinsic::bitreverse, MVT::v8i8, 1}, 288 {Intrinsic::bitreverse, MVT::v16i8, 1}, 289 {Intrinsic::bitreverse, MVT::v4i16, 2}, 290 {Intrinsic::bitreverse, MVT::v8i16, 2}, 291 {Intrinsic::bitreverse, MVT::v2i32, 2}, 292 {Intrinsic::bitreverse, MVT::v4i32, 2}, 293 {Intrinsic::bitreverse, MVT::v1i64, 2}, 294 {Intrinsic::bitreverse, MVT::v2i64, 2}, 295 }; 296 const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy); 297 const auto *Entry = 298 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second); 299 if (Entry) { 300 // Cost Model is using the legal type(i32) that i8 and i16 will be 301 // converted to +1 so that we match the actual lowering cost 302 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 || 303 TLI->getValueType(DL, RetTy, true) == MVT::i16) 304 return LegalisationCost.first * Entry->Cost + 1; 305 306 
return LegalisationCost.first * Entry->Cost; 307 } 308 break; 309 } 310 case Intrinsic::ctpop: { 311 static const CostTblEntry CtpopCostTbl[] = { 312 {ISD::CTPOP, MVT::v2i64, 4}, 313 {ISD::CTPOP, MVT::v4i32, 3}, 314 {ISD::CTPOP, MVT::v8i16, 2}, 315 {ISD::CTPOP, MVT::v16i8, 1}, 316 {ISD::CTPOP, MVT::i64, 4}, 317 {ISD::CTPOP, MVT::v2i32, 3}, 318 {ISD::CTPOP, MVT::v4i16, 2}, 319 {ISD::CTPOP, MVT::v8i8, 1}, 320 {ISD::CTPOP, MVT::i32, 5}, 321 }; 322 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 323 MVT MTy = LT.second; 324 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) { 325 // Extra cost of +1 when illegal vector types are legalized by promoting 326 // the integer type. 327 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() != 328 RetTy->getScalarSizeInBits() 329 ? 1 330 : 0; 331 return LT.first * Entry->Cost + ExtraCost; 332 } 333 break; 334 } 335 case Intrinsic::sadd_with_overflow: 336 case Intrinsic::uadd_with_overflow: 337 case Intrinsic::ssub_with_overflow: 338 case Intrinsic::usub_with_overflow: 339 case Intrinsic::smul_with_overflow: 340 case Intrinsic::umul_with_overflow: { 341 static const CostTblEntry WithOverflowCostTbl[] = { 342 {Intrinsic::sadd_with_overflow, MVT::i8, 3}, 343 {Intrinsic::uadd_with_overflow, MVT::i8, 3}, 344 {Intrinsic::sadd_with_overflow, MVT::i16, 3}, 345 {Intrinsic::uadd_with_overflow, MVT::i16, 3}, 346 {Intrinsic::sadd_with_overflow, MVT::i32, 1}, 347 {Intrinsic::uadd_with_overflow, MVT::i32, 1}, 348 {Intrinsic::sadd_with_overflow, MVT::i64, 1}, 349 {Intrinsic::uadd_with_overflow, MVT::i64, 1}, 350 {Intrinsic::ssub_with_overflow, MVT::i8, 3}, 351 {Intrinsic::usub_with_overflow, MVT::i8, 3}, 352 {Intrinsic::ssub_with_overflow, MVT::i16, 3}, 353 {Intrinsic::usub_with_overflow, MVT::i16, 3}, 354 {Intrinsic::ssub_with_overflow, MVT::i32, 1}, 355 {Intrinsic::usub_with_overflow, MVT::i32, 1}, 356 {Intrinsic::ssub_with_overflow, MVT::i64, 1}, 357 {Intrinsic::usub_with_overflow, MVT::i64, 1}, 358 
{Intrinsic::smul_with_overflow, MVT::i8, 5}, 359 {Intrinsic::umul_with_overflow, MVT::i8, 4}, 360 {Intrinsic::smul_with_overflow, MVT::i16, 5}, 361 {Intrinsic::umul_with_overflow, MVT::i16, 4}, 362 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst 363 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw 364 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp 365 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr 366 }; 367 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true); 368 if (MTy.isSimple()) 369 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(), 370 MTy.getSimpleVT())) 371 return Entry->Cost; 372 break; 373 } 374 case Intrinsic::fptosi_sat: 375 case Intrinsic::fptoui_sat: { 376 if (ICA.getArgTypes().empty()) 377 break; 378 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; 379 auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]); 380 EVT MTy = TLI->getValueType(DL, RetTy); 381 // Check for the legal types, which are where the size of the input and the 382 // output are the same, or we are using cvt f64->i32 or f32->i64. 
383 if ((LT.second == MVT::f32 || LT.second == MVT::f64 || 384 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || 385 LT.second == MVT::v2f64) && 386 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || 387 (LT.second == MVT::f64 && MTy == MVT::i32) || 388 (LT.second == MVT::f32 && MTy == MVT::i64))) 389 return LT.first; 390 // Similarly for fp16 sizes 391 if (ST->hasFullFP16() && 392 ((LT.second == MVT::f16 && MTy == MVT::i32) || 393 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && 394 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))) 395 return LT.first; 396 397 // Otherwise we use a legal convert followed by a min+max 398 if ((LT.second.getScalarType() == MVT::f32 || 399 LT.second.getScalarType() == MVT::f64 || 400 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) && 401 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { 402 Type *LegalTy = 403 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits()); 404 if (LT.second.isVector()) 405 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount()); 406 InstructionCost Cost = 1; 407 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin, 408 LegalTy, {LegalTy, LegalTy}); 409 Cost += getIntrinsicInstrCost(Attrs1, CostKind); 410 IntrinsicCostAttributes Attrs2(IsSigned ? 
Intrinsic::smax : Intrinsic::umax, 411 LegalTy, {LegalTy, LegalTy}); 412 Cost += getIntrinsicInstrCost(Attrs2, CostKind); 413 return LT.first * Cost; 414 } 415 break; 416 } 417 default: 418 break; 419 } 420 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 421 } 422 423 /// The function will remove redundant reinterprets casting in the presence 424 /// of the control flow 425 static Optional<Instruction *> processPhiNode(InstCombiner &IC, 426 IntrinsicInst &II) { 427 SmallVector<Instruction *, 32> Worklist; 428 auto RequiredType = II.getType(); 429 430 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0)); 431 assert(PN && "Expected Phi Node!"); 432 433 // Don't create a new Phi unless we can remove the old one. 434 if (!PN->hasOneUse()) 435 return None; 436 437 for (Value *IncValPhi : PN->incoming_values()) { 438 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi); 439 if (!Reinterpret || 440 Reinterpret->getIntrinsicID() != 441 Intrinsic::aarch64_sve_convert_to_svbool || 442 RequiredType != Reinterpret->getArgOperand(0)->getType()) 443 return None; 444 } 445 446 // Create the new Phi 447 LLVMContext &Ctx = PN->getContext(); 448 IRBuilder<> Builder(Ctx); 449 Builder.SetInsertPoint(PN); 450 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); 451 Worklist.push_back(PN); 452 453 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { 454 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); 455 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); 456 Worklist.push_back(Reinterpret); 457 } 458 459 // Cleanup Phi Node and reinterprets 460 return IC.replaceInstUsesWith(II, NPN); 461 } 462 463 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))) 464 // => (binop (pred) (from_svbool _) (from_svbool _)) 465 // 466 // The above transformation eliminates a `to_svbool` in the predicate 467 // operand of bitwise operation `binop` by narrowing the vector width of 468 // the operation. 
For example, it would convert a `<vscale x 16 x i1> 469 // and` into a `<vscale x 4 x i1> and`. This is profitable because 470 // to_svbool must zero the new lanes during widening, whereas 471 // from_svbool is free. 472 static Optional<Instruction *> tryCombineFromSVBoolBinOp(InstCombiner &IC, 473 IntrinsicInst &II) { 474 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0)); 475 if (!BinOp) 476 return None; 477 478 auto IntrinsicID = BinOp->getIntrinsicID(); 479 switch (IntrinsicID) { 480 case Intrinsic::aarch64_sve_and_z: 481 case Intrinsic::aarch64_sve_bic_z: 482 case Intrinsic::aarch64_sve_eor_z: 483 case Intrinsic::aarch64_sve_nand_z: 484 case Intrinsic::aarch64_sve_nor_z: 485 case Intrinsic::aarch64_sve_orn_z: 486 case Intrinsic::aarch64_sve_orr_z: 487 break; 488 default: 489 return None; 490 } 491 492 auto BinOpPred = BinOp->getOperand(0); 493 auto BinOpOp1 = BinOp->getOperand(1); 494 auto BinOpOp2 = BinOp->getOperand(2); 495 496 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred); 497 if (!PredIntr || 498 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) 499 return None; 500 501 auto PredOp = PredIntr->getOperand(0); 502 auto PredOpTy = cast<VectorType>(PredOp->getType()); 503 if (PredOpTy != II.getType()) 504 return None; 505 506 IRBuilder<> Builder(II.getContext()); 507 Builder.SetInsertPoint(&II); 508 509 SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; 510 auto NarrowBinOpOp1 = Builder.CreateIntrinsic( 511 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1}); 512 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 513 if (BinOpOp1 == BinOpOp2) 514 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 515 else 516 NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic( 517 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2})); 518 519 auto NarrowedBinOp = 520 Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs); 521 return IC.replaceInstUsesWith(II, NarrowedBinOp); 522 } 523 524 static 
Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC, 525 IntrinsicInst &II) { 526 // If the reinterpret instruction operand is a PHI Node 527 if (isa<PHINode>(II.getArgOperand(0))) 528 return processPhiNode(IC, II); 529 530 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) 531 return BinOpCombine; 532 533 SmallVector<Instruction *, 32> CandidatesForRemoval; 534 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; 535 536 const auto *IVTy = cast<VectorType>(II.getType()); 537 538 // Walk the chain of conversions. 539 while (Cursor) { 540 // If the type of the cursor has fewer lanes than the final result, zeroing 541 // must take place, which breaks the equivalence chain. 542 const auto *CursorVTy = cast<VectorType>(Cursor->getType()); 543 if (CursorVTy->getElementCount().getKnownMinValue() < 544 IVTy->getElementCount().getKnownMinValue()) 545 break; 546 547 // If the cursor has the same type as I, it is a viable replacement. 548 if (Cursor->getType() == IVTy) 549 EarliestReplacement = Cursor; 550 551 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); 552 553 // If this is not an SVE conversion intrinsic, this is the end of the chain. 554 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == 555 Intrinsic::aarch64_sve_convert_to_svbool || 556 IntrinsicCursor->getIntrinsicID() == 557 Intrinsic::aarch64_sve_convert_from_svbool)) 558 break; 559 560 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); 561 Cursor = IntrinsicCursor->getOperand(0); 562 } 563 564 // If no viable replacement in the conversion chain was found, there is 565 // nothing to do. 
566 if (!EarliestReplacement) 567 return None; 568 569 return IC.replaceInstUsesWith(II, EarliestReplacement); 570 } 571 572 static Optional<Instruction *> instCombineSVESel(InstCombiner &IC, 573 IntrinsicInst &II) { 574 IRBuilder<> Builder(&II); 575 auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1), 576 II.getOperand(2)); 577 return IC.replaceInstUsesWith(II, Select); 578 } 579 580 static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC, 581 IntrinsicInst &II) { 582 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 583 if (!Pg) 584 return None; 585 586 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 587 return None; 588 589 const auto PTruePattern = 590 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 591 if (PTruePattern != AArch64SVEPredPattern::vl1) 592 return None; 593 594 // The intrinsic is inserting into lane zero so use an insert instead. 595 auto *IdxTy = Type::getInt64Ty(II.getContext()); 596 auto *Insert = InsertElementInst::Create( 597 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0)); 598 Insert->insertBefore(&II); 599 Insert->takeName(&II); 600 601 return IC.replaceInstUsesWith(II, Insert); 602 } 603 604 static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, 605 IntrinsicInst &II) { 606 // Replace DupX with a regular IR splat. 
607 IRBuilder<> Builder(II.getContext()); 608 Builder.SetInsertPoint(&II); 609 auto *RetTy = cast<ScalableVectorType>(II.getType()); 610 Value *Splat = 611 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0)); 612 Splat->takeName(&II); 613 return IC.replaceInstUsesWith(II, Splat); 614 } 615 616 static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, 617 IntrinsicInst &II) { 618 LLVMContext &Ctx = II.getContext(); 619 IRBuilder<> Builder(Ctx); 620 Builder.SetInsertPoint(&II); 621 622 // Check that the predicate is all active 623 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 624 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 625 return None; 626 627 const auto PTruePattern = 628 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 629 if (PTruePattern != AArch64SVEPredPattern::all) 630 return None; 631 632 // Check that we have a compare of zero.. 633 auto *SplatValue = 634 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2))); 635 if (!SplatValue || !SplatValue->isZero()) 636 return None; 637 638 // ..against a dupq 639 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 640 if (!DupQLane || 641 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) 642 return None; 643 644 // Where the dupq is a lane 0 replicate of a vector insert 645 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero()) 646 return None; 647 648 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); 649 if (!VecIns || 650 VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert) 651 return None; 652 653 // Where the vector insert is a fixed constant vector insert into undef at 654 // index zero 655 if (!isa<UndefValue>(VecIns->getArgOperand(0))) 656 return None; 657 658 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero()) 659 return None; 660 661 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1)); 662 if (!ConstVec) 663 return None; 664 665 auto *VecTy = 
dyn_cast<FixedVectorType>(ConstVec->getType()); 666 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType()); 667 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) 668 return None; 669 670 unsigned NumElts = VecTy->getNumElements(); 671 unsigned PredicateBits = 0; 672 673 // Expand intrinsic operands to a 16-bit byte level predicate 674 for (unsigned I = 0; I < NumElts; ++I) { 675 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I)); 676 if (!Arg) 677 return None; 678 if (!Arg->isZero()) 679 PredicateBits |= 1 << (I * (16 / NumElts)); 680 } 681 682 // If all bits are zero bail early with an empty predicate 683 if (PredicateBits == 0) { 684 auto *PFalse = Constant::getNullValue(II.getType()); 685 PFalse->takeName(&II); 686 return IC.replaceInstUsesWith(II, PFalse); 687 } 688 689 // Calculate largest predicate type used (where byte predicate is largest) 690 unsigned Mask = 8; 691 for (unsigned I = 0; I < 16; ++I) 692 if ((PredicateBits & (1 << I)) != 0) 693 Mask |= (I % 8); 694 695 unsigned PredSize = Mask & -Mask; 696 auto *PredType = ScalableVectorType::get( 697 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8)); 698 699 // Ensure all relevant bits are set 700 for (unsigned I = 0; I < 16; I += PredSize) 701 if ((PredicateBits & (1 << I)) == 0) 702 return None; 703 704 auto *PTruePat = 705 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 706 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 707 {PredType}, {PTruePat}); 708 auto *ConvertToSVBool = Builder.CreateIntrinsic( 709 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); 710 auto *ConvertFromSVBool = 711 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, 712 {II.getType()}, {ConvertToSVBool}); 713 714 ConvertFromSVBool->takeName(&II); 715 return IC.replaceInstUsesWith(II, ConvertFromSVBool); 716 } 717 718 static Optional<Instruction *> instCombineSVELast(InstCombiner &IC, 719 
IntrinsicInst &II) { 720 IRBuilder<> Builder(II.getContext()); 721 Builder.SetInsertPoint(&II); 722 Value *Pg = II.getArgOperand(0); 723 Value *Vec = II.getArgOperand(1); 724 auto IntrinsicID = II.getIntrinsicID(); 725 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; 726 727 // lastX(splat(X)) --> X 728 if (auto *SplatVal = getSplatValue(Vec)) 729 return IC.replaceInstUsesWith(II, SplatVal); 730 731 // If x and/or y is a splat value then: 732 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) 733 Value *LHS, *RHS; 734 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) { 735 if (isSplatValue(LHS) || isSplatValue(RHS)) { 736 auto *OldBinOp = cast<BinaryOperator>(Vec); 737 auto OpC = OldBinOp->getOpcode(); 738 auto *NewLHS = 739 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS}); 740 auto *NewRHS = 741 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS}); 742 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( 743 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II); 744 return IC.replaceInstUsesWith(II, NewBinOp); 745 } 746 } 747 748 auto *C = dyn_cast<Constant>(Pg); 749 if (IsAfter && C && C->isNullValue()) { 750 // The intrinsic is extracting lane 0 so use an extract instead. 751 auto *IdxTy = Type::getInt64Ty(II.getContext()); 752 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0)); 753 Extract->insertBefore(&II); 754 Extract->takeName(&II); 755 return IC.replaceInstUsesWith(II, Extract); 756 } 757 758 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg); 759 if (!IntrPG) 760 return None; 761 762 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 763 return None; 764 765 const auto PTruePattern = 766 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue(); 767 768 // Can the intrinsic's predicate be converted to a known constant index? 
769 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern); 770 if (!MinNumElts) 771 return None; 772 773 unsigned Idx = MinNumElts - 1; 774 // Increment the index if extracting the element after the last active 775 // predicate element. 776 if (IsAfter) 777 ++Idx; 778 779 // Ignore extracts whose index is larger than the known minimum vector 780 // length. NOTE: This is an artificial constraint where we prefer to 781 // maintain what the user asked for until an alternative is proven faster. 782 auto *PgVTy = cast<ScalableVectorType>(Pg->getType()); 783 if (Idx >= PgVTy->getMinNumElements()) 784 return None; 785 786 // The intrinsic is extracting a fixed lane so use an extract instead. 787 auto *IdxTy = Type::getInt64Ty(II.getContext()); 788 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx)); 789 Extract->insertBefore(&II); 790 Extract->takeName(&II); 791 return IC.replaceInstUsesWith(II, Extract); 792 } 793 794 static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC, 795 IntrinsicInst &II) { 796 LLVMContext &Ctx = II.getContext(); 797 IRBuilder<> Builder(Ctx); 798 Builder.SetInsertPoint(&II); 799 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr 800 // can work with RDFFR_PP for ptest elimination. 
801 auto *AllPat = 802 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 803 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 804 {II.getType()}, {AllPat}); 805 auto *RDFFR = 806 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); 807 RDFFR->takeName(&II); 808 return IC.replaceInstUsesWith(II, RDFFR); 809 } 810 811 static Optional<Instruction *> 812 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { 813 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); 814 815 if (Pattern == AArch64SVEPredPattern::all) { 816 LLVMContext &Ctx = II.getContext(); 817 IRBuilder<> Builder(Ctx); 818 Builder.SetInsertPoint(&II); 819 820 Constant *StepVal = ConstantInt::get(II.getType(), NumElts); 821 auto *VScale = Builder.CreateVScale(StepVal); 822 VScale->takeName(&II); 823 return IC.replaceInstUsesWith(II, VScale); 824 } 825 826 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); 827 828 return MinNumElts && NumElts >= MinNumElts 829 ? 
Optional<Instruction *>(IC.replaceInstUsesWith( 830 II, ConstantInt::get(II.getType(), MinNumElts))) 831 : None; 832 } 833 834 static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, 835 IntrinsicInst &II) { 836 IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 837 IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 838 839 if (Op1 && Op2 && 840 Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 841 Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 842 Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) { 843 844 IRBuilder<> Builder(II.getContext()); 845 Builder.SetInsertPoint(&II); 846 847 Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)}; 848 Type *Tys[] = {Op1->getArgOperand(0)->getType()}; 849 850 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); 851 852 PTest->takeName(&II); 853 return IC.replaceInstUsesWith(II, PTest); 854 } 855 856 return None; 857 } 858 859 static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC, 860 IntrinsicInst &II) { 861 // fold (fadd p a (fmul p b c)) -> (fma p a b c) 862 Value *P = II.getOperand(0); 863 Value *A = II.getOperand(1); 864 auto FMul = II.getOperand(2); 865 Value *B, *C; 866 if (!match(FMul, m_Intrinsic<Intrinsic::aarch64_sve_fmul>( 867 m_Specific(P), m_Value(B), m_Value(C)))) 868 return None; 869 870 if (!FMul->hasOneUse()) 871 return None; 872 873 llvm::FastMathFlags FAddFlags = II.getFastMathFlags(); 874 // Stop the combine when the flags on the inputs differ in case dropping flags 875 // would lead to us missing out on more beneficial optimizations. 
876 if (FAddFlags != cast<CallInst>(FMul)->getFastMathFlags()) 877 return None; 878 if (!FAddFlags.allowContract()) 879 return None; 880 881 IRBuilder<> Builder(II.getContext()); 882 Builder.SetInsertPoint(&II); 883 auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla, 884 {II.getType()}, {P, A, B, C}, &II); 885 FMLA->setFastMathFlags(FAddFlags); 886 return IC.replaceInstUsesWith(II, FMLA); 887 } 888 889 static bool isAllActivePredicate(Value *Pred) { 890 // Look through convert.from.svbool(convert.to.svbool(...) chain. 891 Value *UncastedPred; 892 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( 893 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( 894 m_Value(UncastedPred))))) 895 // If the predicate has the same or less lanes than the uncasted 896 // predicate then we know the casting has no effect. 897 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <= 898 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements()) 899 Pred = UncastedPred; 900 901 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 902 m_ConstantInt<AArch64SVEPredPattern::all>())); 903 } 904 905 static Optional<Instruction *> 906 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 907 IRBuilder<> Builder(II.getContext()); 908 Builder.SetInsertPoint(&II); 909 910 Value *Pred = II.getOperand(0); 911 Value *PtrOp = II.getOperand(1); 912 Type *VecTy = II.getType(); 913 Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo()); 914 915 if (isAllActivePredicate(Pred)) { 916 LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr); 917 Load->copyMetadata(II); 918 return IC.replaceInstUsesWith(II, Load); 919 } 920 921 CallInst *MaskedLoad = 922 Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL), 923 Pred, ConstantAggregateZero::get(VecTy)); 924 MaskedLoad->copyMetadata(II); 925 return IC.replaceInstUsesWith(II, MaskedLoad); 926 } 927 928 static Optional<Instruction *> 929 
instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 930 IRBuilder<> Builder(II.getContext()); 931 Builder.SetInsertPoint(&II); 932 933 Value *VecOp = II.getOperand(0); 934 Value *Pred = II.getOperand(1); 935 Value *PtrOp = II.getOperand(2); 936 Value *VecPtr = 937 Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo()); 938 939 if (isAllActivePredicate(Pred)) { 940 StoreInst *Store = Builder.CreateStore(VecOp, VecPtr); 941 Store->copyMetadata(II); 942 return IC.eraseInstFromFunction(II); 943 } 944 945 CallInst *MaskedStore = Builder.CreateMaskedStore( 946 VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred); 947 MaskedStore->copyMetadata(II); 948 return IC.eraseInstFromFunction(II); 949 } 950 951 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { 952 switch (Intrinsic) { 953 case Intrinsic::aarch64_sve_fmul: 954 return Instruction::BinaryOps::FMul; 955 case Intrinsic::aarch64_sve_fadd: 956 return Instruction::BinaryOps::FAdd; 957 case Intrinsic::aarch64_sve_fsub: 958 return Instruction::BinaryOps::FSub; 959 default: 960 return Instruction::BinaryOpsEnd; 961 } 962 } 963 964 static Optional<Instruction *> instCombineSVEVectorBinOp(InstCombiner &IC, 965 IntrinsicInst &II) { 966 auto *OpPredicate = II.getOperand(0); 967 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID()); 968 if (BinOpCode == Instruction::BinaryOpsEnd || 969 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 970 m_ConstantInt<AArch64SVEPredPattern::all>()))) 971 return None; 972 IRBuilder<> Builder(II.getContext()); 973 Builder.SetInsertPoint(&II); 974 Builder.setFastMathFlags(II.getFastMathFlags()); 975 auto BinOp = 976 Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2)); 977 return IC.replaceInstUsesWith(II, BinOp); 978 } 979 980 static Optional<Instruction *> instCombineSVEVectorFAdd(InstCombiner &IC, 981 IntrinsicInst &II) { 982 if (auto FMLA = instCombineSVEVectorFMLA(IC, II)) 983 return FMLA; 984 return 
instCombineSVEVectorBinOp(IC, II); 985 } 986 987 static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC, 988 IntrinsicInst &II) { 989 auto *OpPredicate = II.getOperand(0); 990 auto *OpMultiplicand = II.getOperand(1); 991 auto *OpMultiplier = II.getOperand(2); 992 993 IRBuilder<> Builder(II.getContext()); 994 Builder.SetInsertPoint(&II); 995 996 // Return true if a given instruction is a unit splat value, false otherwise. 997 auto IsUnitSplat = [](auto *I) { 998 auto *SplatValue = getSplatValue(I); 999 if (!SplatValue) 1000 return false; 1001 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 1002 }; 1003 1004 // Return true if a given instruction is an aarch64_sve_dup intrinsic call 1005 // with a unit splat value, false otherwise. 1006 auto IsUnitDup = [](auto *I) { 1007 auto *IntrI = dyn_cast<IntrinsicInst>(I); 1008 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) 1009 return false; 1010 1011 auto *SplatValue = IntrI->getOperand(2); 1012 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 1013 }; 1014 1015 if (IsUnitSplat(OpMultiplier)) { 1016 // [f]mul pg %n, (dupx 1) => %n 1017 OpMultiplicand->takeName(&II); 1018 return IC.replaceInstUsesWith(II, OpMultiplicand); 1019 } else if (IsUnitDup(OpMultiplier)) { 1020 // [f]mul pg %n, (dup pg 1) => %n 1021 auto *DupInst = cast<IntrinsicInst>(OpMultiplier); 1022 auto *DupPg = DupInst->getOperand(1); 1023 // TODO: this is naive. The optimization is still valid if DupPg 1024 // 'encompasses' OpPredicate, not only if they're the same predicate. 
1025 if (OpPredicate == DupPg) { 1026 OpMultiplicand->takeName(&II); 1027 return IC.replaceInstUsesWith(II, OpMultiplicand); 1028 } 1029 } 1030 1031 return instCombineSVEVectorBinOp(IC, II); 1032 } 1033 1034 static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, 1035 IntrinsicInst &II) { 1036 IRBuilder<> Builder(II.getContext()); 1037 Builder.SetInsertPoint(&II); 1038 Value *UnpackArg = II.getArgOperand(0); 1039 auto *RetTy = cast<ScalableVectorType>(II.getType()); 1040 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || 1041 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; 1042 1043 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) 1044 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) 1045 if (auto *ScalarArg = getSplatValue(UnpackArg)) { 1046 ScalarArg = 1047 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned); 1048 Value *NewVal = 1049 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg); 1050 NewVal->takeName(&II); 1051 return IC.replaceInstUsesWith(II, NewVal); 1052 } 1053 1054 return None; 1055 } 1056 static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC, 1057 IntrinsicInst &II) { 1058 auto *OpVal = II.getOperand(0); 1059 auto *OpIndices = II.getOperand(1); 1060 VectorType *VTy = cast<VectorType>(II.getType()); 1061 1062 // Check whether OpIndices is a constant splat value < minimal element count 1063 // of result. 1064 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices)); 1065 if (!SplatValue || 1066 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue())) 1067 return None; 1068 1069 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to 1070 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. 
1071 IRBuilder<> Builder(II.getContext()); 1072 Builder.SetInsertPoint(&II); 1073 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue); 1074 auto *VectorSplat = 1075 Builder.CreateVectorSplat(VTy->getElementCount(), Extract); 1076 1077 VectorSplat->takeName(&II); 1078 return IC.replaceInstUsesWith(II, VectorSplat); 1079 } 1080 1081 static Optional<Instruction *> instCombineSVETupleGet(InstCombiner &IC, 1082 IntrinsicInst &II) { 1083 // Try to remove sequences of tuple get/set. 1084 Value *SetTuple, *SetIndex, *SetValue; 1085 auto *GetTuple = II.getArgOperand(0); 1086 auto *GetIndex = II.getArgOperand(1); 1087 // Check that we have tuple_get(GetTuple, GetIndex) where GetTuple is a 1088 // call to tuple_set i.e. tuple_set(SetTuple, SetIndex, SetValue). 1089 // Make sure that the types of the current intrinsic and SetValue match 1090 // in order to safely remove the sequence. 1091 if (!match(GetTuple, 1092 m_Intrinsic<Intrinsic::aarch64_sve_tuple_set>( 1093 m_Value(SetTuple), m_Value(SetIndex), m_Value(SetValue))) || 1094 SetValue->getType() != II.getType()) 1095 return None; 1096 // Case where we get the same index right after setting it. 1097 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) --> SetValue 1098 if (GetIndex == SetIndex) 1099 return IC.replaceInstUsesWith(II, SetValue); 1100 // If we are getting a different index than what was set in the tuple_set 1101 // intrinsic. We can just set the input tuple to the one up in the chain. 
1102 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) 1103 // --> tuple_get(SetTuple, GetIndex) 1104 return IC.replaceOperand(II, 0, SetTuple); 1105 } 1106 1107 static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC, 1108 IntrinsicInst &II) { 1109 // zip1(uzp1(A, B), uzp2(A, B)) --> A 1110 // zip2(uzp1(A, B), uzp2(A, B)) --> B 1111 Value *A, *B; 1112 if (match(II.getArgOperand(0), 1113 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) && 1114 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( 1115 m_Specific(A), m_Specific(B)))) 1116 return IC.replaceInstUsesWith( 1117 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); 1118 1119 return None; 1120 } 1121 1122 static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC, 1123 IntrinsicInst &II) { 1124 Value *Mask = II.getOperand(0); 1125 Value *BasePtr = II.getOperand(1); 1126 Value *Index = II.getOperand(2); 1127 Type *Ty = II.getType(); 1128 Value *PassThru = ConstantAggregateZero::get(Ty); 1129 1130 // Contiguous gather => masked load. 
1131 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) 1132 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) 1133 Value *IndexBase; 1134 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1135 m_Value(IndexBase), m_SpecificInt(1)))) { 1136 IRBuilder<> Builder(II.getContext()); 1137 Builder.SetInsertPoint(&II); 1138 1139 Align Alignment = 1140 BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); 1141 1142 Type *VecPtrTy = PointerType::getUnqual(Ty); 1143 Value *Ptr = Builder.CreateGEP( 1144 cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase); 1145 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); 1146 CallInst *MaskedLoad = 1147 Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); 1148 MaskedLoad->takeName(&II); 1149 return IC.replaceInstUsesWith(II, MaskedLoad); 1150 } 1151 1152 return None; 1153 } 1154 1155 static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC, 1156 IntrinsicInst &II) { 1157 Value *Val = II.getOperand(0); 1158 Value *Mask = II.getOperand(1); 1159 Value *BasePtr = II.getOperand(2); 1160 Value *Index = II.getOperand(3); 1161 Type *Ty = Val->getType(); 1162 1163 // Contiguous scatter => masked store. 
1164 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) 1165 // => (masked.store Value (gep BasePtr IndexBase) Align Mask) 1166 Value *IndexBase; 1167 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1168 m_Value(IndexBase), m_SpecificInt(1)))) { 1169 IRBuilder<> Builder(II.getContext()); 1170 Builder.SetInsertPoint(&II); 1171 1172 Align Alignment = 1173 BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); 1174 1175 Value *Ptr = Builder.CreateGEP( 1176 cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase); 1177 Type *VecPtrTy = PointerType::getUnqual(Ty); 1178 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); 1179 1180 (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); 1181 1182 return IC.eraseInstFromFunction(II); 1183 } 1184 1185 return None; 1186 } 1187 1188 static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, 1189 IntrinsicInst &II) { 1190 IRBuilder<> Builder(II.getContext()); 1191 Builder.SetInsertPoint(&II); 1192 Type *Int32Ty = Builder.getInt32Ty(); 1193 Value *Pred = II.getOperand(0); 1194 Value *Vec = II.getOperand(1); 1195 Value *DivVec = II.getOperand(2); 1196 1197 Value *SplatValue = getSplatValue(DivVec); 1198 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue); 1199 if (!SplatConstantInt) 1200 return None; 1201 APInt Divisor = SplatConstantInt->getValue(); 1202 1203 if (Divisor.isPowerOf2()) { 1204 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1205 auto ASRD = Builder.CreateIntrinsic( 1206 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1207 return IC.replaceInstUsesWith(II, ASRD); 1208 } 1209 if (Divisor.isNegatedPowerOf2()) { 1210 Divisor.negate(); 1211 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1212 auto ASRD = Builder.CreateIntrinsic( 1213 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1214 auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg, 1215 
{ASRD->getType()}, {ASRD, Pred, ASRD}); 1216 return IC.replaceInstUsesWith(II, NEG); 1217 } 1218 1219 return None; 1220 } 1221 1222 static Optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, 1223 IntrinsicInst &II) { 1224 Value *A = II.getArgOperand(0); 1225 Value *B = II.getArgOperand(1); 1226 if (A == B) 1227 return IC.replaceInstUsesWith(II, A); 1228 1229 return None; 1230 } 1231 1232 Optional<Instruction *> 1233 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, 1234 IntrinsicInst &II) const { 1235 Intrinsic::ID IID = II.getIntrinsicID(); 1236 switch (IID) { 1237 default: 1238 break; 1239 case Intrinsic::aarch64_neon_fmaxnm: 1240 case Intrinsic::aarch64_neon_fminnm: 1241 return instCombineMaxMinNM(IC, II); 1242 case Intrinsic::aarch64_sve_convert_from_svbool: 1243 return instCombineConvertFromSVBool(IC, II); 1244 case Intrinsic::aarch64_sve_dup: 1245 return instCombineSVEDup(IC, II); 1246 case Intrinsic::aarch64_sve_dup_x: 1247 return instCombineSVEDupX(IC, II); 1248 case Intrinsic::aarch64_sve_cmpne: 1249 case Intrinsic::aarch64_sve_cmpne_wide: 1250 return instCombineSVECmpNE(IC, II); 1251 case Intrinsic::aarch64_sve_rdffr: 1252 return instCombineRDFFR(IC, II); 1253 case Intrinsic::aarch64_sve_lasta: 1254 case Intrinsic::aarch64_sve_lastb: 1255 return instCombineSVELast(IC, II); 1256 case Intrinsic::aarch64_sve_cntd: 1257 return instCombineSVECntElts(IC, II, 2); 1258 case Intrinsic::aarch64_sve_cntw: 1259 return instCombineSVECntElts(IC, II, 4); 1260 case Intrinsic::aarch64_sve_cnth: 1261 return instCombineSVECntElts(IC, II, 8); 1262 case Intrinsic::aarch64_sve_cntb: 1263 return instCombineSVECntElts(IC, II, 16); 1264 case Intrinsic::aarch64_sve_ptest_any: 1265 case Intrinsic::aarch64_sve_ptest_first: 1266 case Intrinsic::aarch64_sve_ptest_last: 1267 return instCombineSVEPTest(IC, II); 1268 case Intrinsic::aarch64_sve_mul: 1269 case Intrinsic::aarch64_sve_fmul: 1270 return instCombineSVEVectorMul(IC, II); 1271 case Intrinsic::aarch64_sve_fadd: 
1272 return instCombineSVEVectorFAdd(IC, II); 1273 case Intrinsic::aarch64_sve_fsub: 1274 return instCombineSVEVectorBinOp(IC, II); 1275 case Intrinsic::aarch64_sve_tbl: 1276 return instCombineSVETBL(IC, II); 1277 case Intrinsic::aarch64_sve_uunpkhi: 1278 case Intrinsic::aarch64_sve_uunpklo: 1279 case Intrinsic::aarch64_sve_sunpkhi: 1280 case Intrinsic::aarch64_sve_sunpklo: 1281 return instCombineSVEUnpack(IC, II); 1282 case Intrinsic::aarch64_sve_tuple_get: 1283 return instCombineSVETupleGet(IC, II); 1284 case Intrinsic::aarch64_sve_zip1: 1285 case Intrinsic::aarch64_sve_zip2: 1286 return instCombineSVEZip(IC, II); 1287 case Intrinsic::aarch64_sve_ld1_gather_index: 1288 return instCombineLD1GatherIndex(IC, II); 1289 case Intrinsic::aarch64_sve_st1_scatter_index: 1290 return instCombineST1ScatterIndex(IC, II); 1291 case Intrinsic::aarch64_sve_ld1: 1292 return instCombineSVELD1(IC, II, DL); 1293 case Intrinsic::aarch64_sve_st1: 1294 return instCombineSVEST1(IC, II, DL); 1295 case Intrinsic::aarch64_sve_sdiv: 1296 return instCombineSVESDIV(IC, II); 1297 case Intrinsic::aarch64_sve_sel: 1298 return instCombineSVESel(IC, II); 1299 } 1300 1301 return None; 1302 } 1303 1304 Optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( 1305 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, 1306 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, 1307 std::function<void(Instruction *, unsigned, APInt, APInt &)> 1308 SimplifyAndSetOp) const { 1309 switch (II.getIntrinsicID()) { 1310 default: 1311 break; 1312 case Intrinsic::aarch64_neon_fcvtxn: 1313 case Intrinsic::aarch64_neon_rshrn: 1314 case Intrinsic::aarch64_neon_sqrshrn: 1315 case Intrinsic::aarch64_neon_sqrshrun: 1316 case Intrinsic::aarch64_neon_sqshrn: 1317 case Intrinsic::aarch64_neon_sqshrun: 1318 case Intrinsic::aarch64_neon_sqxtn: 1319 case Intrinsic::aarch64_neon_sqxtun: 1320 case Intrinsic::aarch64_neon_uqrshrn: 1321 case Intrinsic::aarch64_neon_uqshrn: 1322 case 
Intrinsic::aarch64_neon_uqxtn: 1323 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); 1324 break; 1325 } 1326 1327 return None; 1328 } 1329 1330 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, 1331 ArrayRef<const Value *> Args) { 1332 1333 // A helper that returns a vector type from the given type. The number of 1334 // elements in type Ty determines the vector width. 1335 auto toVectorTy = [&](Type *ArgTy) { 1336 return VectorType::get(ArgTy->getScalarType(), 1337 cast<VectorType>(DstTy)->getElementCount()); 1338 }; 1339 1340 // Exit early if DstTy is not a vector type whose elements are at least 1341 // 16-bits wide. 1342 if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16) 1343 return false; 1344 1345 // Determine if the operation has a widening variant. We consider both the 1346 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the 1347 // instructions. 1348 // 1349 // TODO: Add additional widening operations (e.g., shl, etc.) once we 1350 // verify that their extending operands are eliminated during code 1351 // generation. 1352 switch (Opcode) { 1353 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). 1354 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). 1355 case Instruction::Mul: // SMULL(2), UMULL(2) 1356 break; 1357 default: 1358 return false; 1359 } 1360 1361 // To be a widening instruction (either the "wide" or "long" versions), the 1362 // second operand must be a sign- or zero extend. 1363 if (Args.size() != 2 || 1364 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1]))) 1365 return false; 1366 auto *Extend = cast<CastInst>(Args[1]); 1367 auto *Arg0 = dyn_cast<CastInst>(Args[0]); 1368 1369 // A mul only has a mull version (not like addw). Both operands need to be 1370 // extending and the same type. 
1371 if (Opcode == Instruction::Mul && 1372 (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() || 1373 Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType())) 1374 return false; 1375 1376 // Legalize the destination type and ensure it can be used in a widening 1377 // operation. 1378 auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); 1379 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); 1380 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) 1381 return false; 1382 1383 // Legalize the source type and ensure it can be used in a widening 1384 // operation. 1385 auto *SrcTy = toVectorTy(Extend->getSrcTy()); 1386 auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); 1387 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); 1388 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) 1389 return false; 1390 1391 // Get the total number of vector elements in the legalized types. 1392 InstructionCost NumDstEls = 1393 DstTyL.first * DstTyL.second.getVectorMinNumElements(); 1394 InstructionCost NumSrcEls = 1395 SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); 1396 1397 // Return true if the legalized types have the same number of vector elements 1398 // and the destination element type size is twice that of the source type. 1399 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; 1400 } 1401 1402 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 1403 Type *Src, 1404 TTI::CastContextHint CCH, 1405 TTI::TargetCostKind CostKind, 1406 const Instruction *I) { 1407 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1408 assert(ISD && "Invalid opcode"); 1409 1410 // If the cast is observable, and it is used by a widening instruction (e.g., 1411 // uaddl, saddw, etc.), it may be free. 
1412 if (I && I->hasOneUser()) { 1413 auto *SingleUser = cast<Instruction>(*I->user_begin()); 1414 SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); 1415 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { 1416 // If the cast is the second operand, it is free. We will generate either 1417 // a "wide" or "long" version of the widening instruction. 1418 if (I == SingleUser->getOperand(1)) 1419 return 0; 1420 // If the cast is not the second operand, it will be free if it looks the 1421 // same as the second operand. In this case, we will generate a "long" 1422 // version of the widening instruction. 1423 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) 1424 if (I->getOpcode() == unsigned(Cast->getOpcode()) && 1425 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) 1426 return 0; 1427 } 1428 } 1429 1430 // TODO: Allow non-throughput costs that aren't binary. 1431 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { 1432 if (CostKind != TTI::TCK_RecipThroughput) 1433 return Cost == 0 ? 
0 : 1; 1434 return Cost; 1435 }; 1436 1437 EVT SrcTy = TLI->getValueType(DL, Src); 1438 EVT DstTy = TLI->getValueType(DL, Dst); 1439 1440 if (!SrcTy.isSimple() || !DstTy.isSimple()) 1441 return AdjustCost( 1442 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1443 1444 static const TypeConversionCostTblEntry 1445 ConversionTbl[] = { 1446 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, 1447 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, 1448 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, 1449 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, 1450 1451 // Truncations on nxvmiN 1452 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, 1453 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, 1454 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, 1455 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, 1456 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, 1457 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, 1458 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, 1459 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, 1460 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, 1461 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, 1462 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, 1463 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, 1464 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, 1465 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, 1466 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, 1467 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, 1468 1469 // The number of shll instructions for the extension. 
1470 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1471 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1472 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1473 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1474 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1475 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1476 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1477 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1478 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1479 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1480 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1481 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1482 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1483 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1484 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1485 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1486 1487 // LowerVectorINT_TO_FP: 1488 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1489 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1490 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1491 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1492 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1493 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1494 1495 // Complex: to v2f32 1496 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1497 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1498 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1499 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1500 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1501 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1502 1503 // Complex: to v4f32 1504 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, 1505 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1506 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 1507 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1508 1509 // Complex: to v8f32 1510 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1511 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 1512 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1513 
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 1514 1515 // Complex: to v16f32 1516 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1517 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1518 1519 // Complex: to v2f64 1520 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1521 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1522 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1523 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1524 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1525 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1526 1527 1528 // LowerVectorFP_TO_INT 1529 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, 1530 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 1531 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 1532 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, 1533 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 1534 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 1535 1536 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). 1537 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, 1538 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, 1539 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, 1540 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, 1541 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, 1542 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, 1543 1544 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 1545 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, 1546 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, 1547 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, 1548 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, 1549 1550 // Complex, from nxv2f32. 
1551 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1552 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1553 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1554 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1555 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1556 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1557 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1558 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1559 1560 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. 1561 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, 1562 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, 1563 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, 1564 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, 1565 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, 1566 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, 1567 1568 // Complex, from nxv2f64. 1569 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1570 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1571 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1572 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1573 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1574 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1575 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1576 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1577 1578 // Complex, from nxv4f32. 1579 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1580 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1581 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1582 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1583 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1584 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1585 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1586 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1587 1588 // Complex, from nxv8f64. Illegal -> illegal conversions not required. 
1589 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1590 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1591 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1592 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1593 1594 // Complex, from nxv4f64. Illegal -> illegal conversions not required. 1595 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1596 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1597 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1598 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1599 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1600 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1601 1602 // Complex, from nxv8f32. Illegal -> illegal conversions not required. 1603 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 1604 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 1605 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 1606 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 1607 1608 // Complex, from nxv8f16. 1609 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 1610 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 1611 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 1612 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 1613 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 1614 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 1615 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 1616 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 1617 1618 // Complex, from nxv4f16. 1619 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 1620 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 1621 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 1622 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 1623 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 1624 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 1625 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 1626 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 1627 1628 // Complex, from nxv2f16. 
1629 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 1630 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 1631 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 1632 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 1633 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 1634 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 1635 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 1636 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 1637 1638 // Truncate from nxvmf32 to nxvmf16. 1639 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, 1640 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, 1641 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, 1642 1643 // Truncate from nxvmf64 to nxvmf16. 1644 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 }, 1645 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 }, 1646 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 }, 1647 1648 // Truncate from nxvmf64 to nxvmf32. 1649 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 }, 1650 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 }, 1651 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 }, 1652 1653 // Extend from nxvmf16 to nxvmf32. 1654 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, 1655 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, 1656 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, 1657 1658 // Extend from nxvmf16 to nxvmf64. 1659 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, 1660 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, 1661 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, 1662 1663 // Extend from nxvmf32 to nxvmf64. 
1664 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, 1665 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, 1666 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, 1667 1668 // Bitcasts from float to integer 1669 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 }, 1670 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 }, 1671 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 }, 1672 1673 // Bitcasts from integer to float 1674 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 }, 1675 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 }, 1676 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 }, 1677 }; 1678 1679 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, 1680 DstTy.getSimpleVT(), 1681 SrcTy.getSimpleVT())) 1682 return AdjustCost(Entry->Cost); 1683 1684 static const TypeConversionCostTblEntry FP16Tbl[] = { 1685 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs 1686 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, 1687 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs 1688 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, 1689 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs 1690 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, 1691 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn 1692 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, 1693 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs 1694 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, 1695 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs 1696 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, 1697 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn 1698 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, 1699 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs 1700 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, 1701 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs 1702 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, 1703 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf 1704 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + 
scvtf 1705 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf 1706 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf 1707 }; 1708 1709 if (ST->hasFullFP16()) 1710 if (const auto *Entry = ConvertCostTableLookup( 1711 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 1712 return AdjustCost(Entry->Cost); 1713 1714 return AdjustCost( 1715 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1716 } 1717 1718 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, 1719 Type *Dst, 1720 VectorType *VecTy, 1721 unsigned Index) { 1722 1723 // Make sure we were given a valid extend opcode. 1724 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && 1725 "Invalid opcode"); 1726 1727 // We are extending an element we extract from a vector, so the source type 1728 // of the extend is the element type of the vector. 1729 auto *Src = VecTy->getElementType(); 1730 1731 // Sign- and zero-extends are for integer types only. 1732 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); 1733 1734 // Get the cost for the extract. We compute the cost (if any) for the extend 1735 // below. 1736 InstructionCost Cost = 1737 getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); 1738 1739 // Legalize the types. 1740 auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); 1741 auto DstVT = TLI->getValueType(DL, Dst); 1742 auto SrcVT = TLI->getValueType(DL, Src); 1743 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1744 1745 // If the resulting type is still a vector and the destination type is legal, 1746 // we may get the extension for free. If not, get the default cost for the 1747 // extend. 1748 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) 1749 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1750 CostKind); 1751 1752 // The destination type should be larger than the element type. 
If not, get 1753 // the default cost for the extend. 1754 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) 1755 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1756 CostKind); 1757 1758 switch (Opcode) { 1759 default: 1760 llvm_unreachable("Opcode should be either SExt or ZExt"); 1761 1762 // For sign-extends, we only need a smov, which performs the extension 1763 // automatically. 1764 case Instruction::SExt: 1765 return Cost; 1766 1767 // For zero-extends, the extend is performed automatically by a umov unless 1768 // the destination type is i64 and the element type is i8 or i16. 1769 case Instruction::ZExt: 1770 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) 1771 return Cost; 1772 } 1773 1774 // If we are unable to perform the extend for free, get the default cost. 1775 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1776 CostKind); 1777 } 1778 1779 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, 1780 TTI::TargetCostKind CostKind, 1781 const Instruction *I) { 1782 if (CostKind != TTI::TCK_RecipThroughput) 1783 return Opcode == Instruction::PHI ? 0 : 1; 1784 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); 1785 // Branches are assumed to be predicted. 1786 return 0; 1787 } 1788 1789 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 1790 unsigned Index) { 1791 assert(Val->isVectorTy() && "This must be a vector type"); 1792 1793 if (Index != -1U) { 1794 // Legalize the type. 1795 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); 1796 1797 // This type is legalized to a scalar type. 1798 if (!LT.second.isVector()) 1799 return 0; 1800 1801 // The type may be split. For fixed-width vectors we can normalize the 1802 // index to the new type. 
1803 if (LT.second.isFixedLengthVector()) { 1804 unsigned Width = LT.second.getVectorNumElements(); 1805 Index = Index % Width; 1806 } 1807 1808 // The element at index zero is already inside the vector. 1809 if (Index == 0) 1810 return 0; 1811 } 1812 1813 // All other insert/extracts cost this much. 1814 return ST->getVectorInsertExtractBaseCost(); 1815 } 1816 1817 InstructionCost AArch64TTIImpl::getArithmeticInstrCost( 1818 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 1819 TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, 1820 TTI::OperandValueProperties Opd1PropInfo, 1821 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, 1822 const Instruction *CxtI) { 1823 // TODO: Handle more cost kinds. 1824 if (CostKind != TTI::TCK_RecipThroughput) 1825 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 1826 Opd2Info, Opd1PropInfo, 1827 Opd2PropInfo, Args, CxtI); 1828 1829 // Legalize the type. 1830 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 1831 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1832 1833 switch (ISD) { 1834 default: 1835 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 1836 Opd2Info, Opd1PropInfo, Opd2PropInfo); 1837 case ISD::SDIV: 1838 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue && 1839 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { 1840 // On AArch64, scalar signed division by constants power-of-two are 1841 // normally expanded to the sequence ADD + CMP + SELECT + SRA. 1842 // The OperandValue properties many not be same as that of previous 1843 // operation; conservatively assume OP_None. 
1844 InstructionCost Cost = getArithmeticInstrCost( 1845 Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info, 1846 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 1847 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Opd1Info, 1848 Opd2Info, TargetTransformInfo::OP_None, 1849 TargetTransformInfo::OP_None); 1850 Cost += getArithmeticInstrCost( 1851 Instruction::Select, Ty, CostKind, Opd1Info, Opd2Info, 1852 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 1853 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Opd1Info, 1854 Opd2Info, TargetTransformInfo::OP_None, 1855 TargetTransformInfo::OP_None); 1856 return Cost; 1857 } 1858 LLVM_FALLTHROUGH; 1859 case ISD::UDIV: { 1860 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) { 1861 auto VT = TLI->getValueType(DL, Ty); 1862 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) { 1863 // Vector signed division by constant are expanded to the 1864 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division 1865 // to MULHS + SUB + SRL + ADD + SRL. 1866 InstructionCost MulCost = getArithmeticInstrCost( 1867 Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info, 1868 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 1869 InstructionCost AddCost = getArithmeticInstrCost( 1870 Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info, 1871 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 1872 InstructionCost ShrCost = getArithmeticInstrCost( 1873 Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info, 1874 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 1875 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1; 1876 } 1877 } 1878 1879 InstructionCost Cost = BaseT::getArithmeticInstrCost( 1880 Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); 1881 if (Ty->isVectorTy()) { 1882 // On AArch64, vector divisions are not supported natively and are 1883 // expanded into scalar divisions of each pair of elements. 
1884 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind, 1885 Opd1Info, Opd2Info, Opd1PropInfo, 1886 Opd2PropInfo); 1887 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind, 1888 Opd1Info, Opd2Info, Opd1PropInfo, 1889 Opd2PropInfo); 1890 // TODO: if one of the arguments is scalar, then it's not necessary to 1891 // double the cost of handling the vector elements. 1892 Cost += Cost; 1893 } 1894 return Cost; 1895 } 1896 case ISD::MUL: 1897 // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive 1898 // as elements are extracted from the vectors and the muls scalarized. 1899 // As getScalarizationOverhead is a bit too pessimistic, we estimate the 1900 // cost for a i64 vector directly here, which is: 1901 // - four 2-cost i64 extracts, 1902 // - two 2-cost i64 inserts, and 1903 // - two 1-cost muls. 1904 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with 1905 // LT.first = 2 the cost is 28. If both operands are extensions it will not 1906 // need to scalarize so the cost can be cheaper (smull or umull). 1907 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) 1908 return LT.first; 1909 return LT.first * 14; 1910 case ISD::ADD: 1911 case ISD::XOR: 1912 case ISD::OR: 1913 case ISD::AND: 1914 case ISD::SRL: 1915 case ISD::SRA: 1916 case ISD::SHL: 1917 // These nodes are marked as 'custom' for combining purposes only. 1918 // We know that they are legal. See LowerAdd in ISelLowering. 1919 return LT.first; 1920 1921 case ISD::FADD: 1922 case ISD::FSUB: 1923 case ISD::FMUL: 1924 case ISD::FDIV: 1925 case ISD::FNEG: 1926 // These nodes are marked as 'custom' just to lower them to SVE. 1927 // We know said lowering will incur no additional cost. 
1928 if (!Ty->getScalarType()->isFP128Ty()) 1929 return 2 * LT.first; 1930 1931 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 1932 Opd2Info, Opd1PropInfo, Opd2PropInfo); 1933 } 1934 } 1935 1936 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, 1937 ScalarEvolution *SE, 1938 const SCEV *Ptr) { 1939 // Address computations in vectorized code with non-consecutive addresses will 1940 // likely result in more instructions compared to scalar code where the 1941 // computation can more often be merged into the index mode. The resulting 1942 // extra micro-ops can significantly decrease throughput. 1943 unsigned NumVectorInstToHideOverhead = 10; 1944 int MaxMergeDistance = 64; 1945 1946 if (Ty->isVectorTy() && SE && 1947 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) 1948 return NumVectorInstToHideOverhead; 1949 1950 // In many cases the address computation is not merged into the instruction 1951 // addressing mode. 1952 return 1; 1953 } 1954 1955 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 1956 Type *CondTy, 1957 CmpInst::Predicate VecPred, 1958 TTI::TargetCostKind CostKind, 1959 const Instruction *I) { 1960 // TODO: Handle other cost kinds. 1961 if (CostKind != TTI::TCK_RecipThroughput) 1962 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 1963 I); 1964 1965 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1966 // We don't lower some vector selects well that are wider than the register 1967 // width. 1968 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { 1969 // We would need this many instructions to hide the scalarization happening. 1970 const int AmortizationCost = 20; 1971 1972 // If VecPred is not set, check if we can get a predicate from the context 1973 // instruction, if its type matches the requested ValTy. 
1974 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { 1975 CmpInst::Predicate CurrentPred; 1976 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), 1977 m_Value()))) 1978 VecPred = CurrentPred; 1979 } 1980 // Check if we have a compare/select chain that can be lowered using 1981 // a (F)CMxx & BFI pair. 1982 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE || 1983 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT || 1984 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ || 1985 VecPred == CmpInst::FCMP_UNE) { 1986 static const auto ValidMinMaxTys = { 1987 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, 1988 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64}; 1989 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16}; 1990 1991 auto LT = TLI->getTypeLegalizationCost(DL, ValTy); 1992 if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; }) || 1993 (ST->hasFullFP16() && 1994 any_of(ValidFP16MinMaxTys, [<](MVT M) { return M == LT.second; }))) 1995 return LT.first; 1996 } 1997 1998 static const TypeConversionCostTblEntry 1999 VectorSelectTbl[] = { 2000 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, 2001 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, 2002 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, 2003 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, 2004 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, 2005 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } 2006 }; 2007 2008 EVT SelCondTy = TLI->getValueType(DL, CondTy); 2009 EVT SelValTy = TLI->getValueType(DL, ValTy); 2010 if (SelCondTy.isSimple() && SelValTy.isSimple()) { 2011 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, 2012 SelCondTy.getSimpleVT(), 2013 SelValTy.getSimpleVT())) 2014 return Entry->Cost; 2015 } 2016 } 2017 // The base case handles scalable vectors fine for now, since it treats the 2018 // cost as 1 * 
legalization cost. 2019 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); 2020 } 2021 2022 AArch64TTIImpl::TTI::MemCmpExpansionOptions 2023 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { 2024 TTI::MemCmpExpansionOptions Options; 2025 if (ST->requiresStrictAlign()) { 2026 // TODO: Add cost modeling for strict align. Misaligned loads expand to 2027 // a bunch of instructions when strict align is enabled. 2028 return Options; 2029 } 2030 Options.AllowOverlappingLoads = true; 2031 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); 2032 Options.NumLoadsPerBlock = Options.MaxNumLoads; 2033 // TODO: Though vector loads usually perform well on AArch64, in some targets 2034 // they may wake up the FP unit, which raises the power consumption. Perhaps 2035 // they could be used with no holds barred (-O3). 2036 Options.LoadSizes = {8, 4, 2, 1}; 2037 return Options; 2038 } 2039 2040 InstructionCost 2041 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, 2042 Align Alignment, unsigned AddressSpace, 2043 TTI::TargetCostKind CostKind) { 2044 if (useNeonVector(Src)) 2045 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 2046 CostKind); 2047 auto LT = TLI->getTypeLegalizationCost(DL, Src); 2048 if (!LT.first.isValid()) 2049 return InstructionCost::getInvalid(); 2050 2051 // The code-generator is currently not able to handle scalable vectors 2052 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2053 // it. This change will be removed when code-generation for these types is 2054 // sufficiently reliable. 2055 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1)) 2056 return InstructionCost::getInvalid(); 2057 2058 return LT.first * 2; 2059 } 2060 2061 static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { 2062 return Opcode == Instruction::Load ? 
SVEGatherOverhead : SVEScatterOverhead; 2063 } 2064 2065 InstructionCost AArch64TTIImpl::getGatherScatterOpCost( 2066 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 2067 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 2068 if (useNeonVector(DataTy)) 2069 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 2070 Alignment, CostKind, I); 2071 auto *VT = cast<VectorType>(DataTy); 2072 auto LT = TLI->getTypeLegalizationCost(DL, DataTy); 2073 if (!LT.first.isValid()) 2074 return InstructionCost::getInvalid(); 2075 2076 // The code-generator is currently not able to handle scalable vectors 2077 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2078 // it. This change will be removed when code-generation for these types is 2079 // sufficiently reliable. 2080 if (cast<VectorType>(DataTy)->getElementCount() == 2081 ElementCount::getScalable(1)) 2082 return InstructionCost::getInvalid(); 2083 2084 ElementCount LegalVF = LT.second.getVectorElementCount(); 2085 InstructionCost MemOpCost = 2086 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); 2087 // Add on an overhead cost for using gathers/scatters. 2088 // TODO: At the moment this is applied unilaterally for all CPUs, but at some 2089 // point we may want a per-CPU overhead. 
2090 MemOpCost *= getSVEGatherScatterOverhead(Opcode); 2091 return LT.first * MemOpCost * getMaxNumElements(LegalVF); 2092 } 2093 2094 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { 2095 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); 2096 } 2097 2098 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, 2099 MaybeAlign Alignment, 2100 unsigned AddressSpace, 2101 TTI::TargetCostKind CostKind, 2102 const Instruction *I) { 2103 EVT VT = TLI->getValueType(DL, Ty, true); 2104 // Type legalization can't handle structs 2105 if (VT == MVT::Other) 2106 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, 2107 CostKind); 2108 2109 auto LT = TLI->getTypeLegalizationCost(DL, Ty); 2110 if (!LT.first.isValid()) 2111 return InstructionCost::getInvalid(); 2112 2113 // The code-generator is currently not able to handle scalable vectors 2114 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2115 // it. This change will be removed when code-generation for these types is 2116 // sufficiently reliable. 2117 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) 2118 if (VTy->getElementCount() == ElementCount::getScalable(1)) 2119 return InstructionCost::getInvalid(); 2120 2121 // TODO: consider latency as well for TCK_SizeAndLatency. 2122 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) 2123 return LT.first; 2124 2125 if (CostKind != TTI::TCK_RecipThroughput) 2126 return 1; 2127 2128 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && 2129 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) { 2130 // Unaligned stores are extremely inefficient. We don't split all 2131 // unaligned 128-bit stores because the negative impact that has shown in 2132 // practice on inlined block copy code. 2133 // We make such stores expensive so that we will only vectorize if there 2134 // are 6 other instructions getting vectorized. 
2135 const int AmortizationCost = 6; 2136 2137 return LT.first * 2 * AmortizationCost; 2138 } 2139 2140 // Check truncating stores and extending loads. 2141 if (useNeonVector(Ty) && 2142 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { 2143 // v4i8 types are lowered to scalar a load/store and sshll/xtn. 2144 if (VT == MVT::v4i8) 2145 return 2; 2146 // Otherwise we need to scalarize. 2147 return cast<FixedVectorType>(Ty)->getNumElements() * 2; 2148 } 2149 2150 return LT.first; 2151 } 2152 2153 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( 2154 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 2155 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 2156 bool UseMaskForCond, bool UseMaskForGaps) { 2157 assert(Factor >= 2 && "Invalid interleave factor"); 2158 auto *VecVTy = cast<FixedVectorType>(VecTy); 2159 2160 if (!UseMaskForCond && !UseMaskForGaps && 2161 Factor <= TLI->getMaxSupportedInterleaveFactor()) { 2162 unsigned NumElts = VecVTy->getNumElements(); 2163 auto *SubVecTy = 2164 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); 2165 2166 // ldN/stN only support legal vector types of size 64 or 128 in bits. 2167 // Accesses having vector types that are a multiple of 128 bits can be 2168 // matched to more than one ldN/stN instruction. 
2169 bool UseScalable; 2170 if (NumElts % Factor == 0 && 2171 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) 2172 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); 2173 } 2174 2175 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 2176 Alignment, AddressSpace, CostKind, 2177 UseMaskForCond, UseMaskForGaps); 2178 } 2179 2180 InstructionCost 2181 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { 2182 InstructionCost Cost = 0; 2183 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2184 for (auto *I : Tys) { 2185 if (!I->isVectorTy()) 2186 continue; 2187 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == 2188 128) 2189 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + 2190 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); 2191 } 2192 return Cost; 2193 } 2194 2195 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { 2196 return ST->getMaxInterleaveFactor(); 2197 } 2198 2199 // For Falkor, we want to avoid having too many strided loads in a loop since 2200 // that can exhaust the HW prefetcher resources. We adjust the unroller 2201 // MaxCount preference below to attempt to ensure unrolling doesn't create too 2202 // many strided loads. 2203 static void 2204 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2205 TargetTransformInfo::UnrollingPreferences &UP) { 2206 enum { MaxStridedLoads = 7 }; 2207 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { 2208 int StridedLoads = 0; 2209 // FIXME? We could make this more precise by looking at the CFG and 2210 // e.g. not counting loads in each side of an if-then-else diamond. 
2211 for (const auto BB : L->blocks()) { 2212 for (auto &I : *BB) { 2213 LoadInst *LMemI = dyn_cast<LoadInst>(&I); 2214 if (!LMemI) 2215 continue; 2216 2217 Value *PtrValue = LMemI->getPointerOperand(); 2218 if (L->isLoopInvariant(PtrValue)) 2219 continue; 2220 2221 const SCEV *LSCEV = SE.getSCEV(PtrValue); 2222 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); 2223 if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) 2224 continue; 2225 2226 // FIXME? We could take pairing of unrolled load copies into account 2227 // by looking at the AddRec, but we would probably have to limit this 2228 // to loops with no stores or other memory optimization barriers. 2229 ++StridedLoads; 2230 // We've seen enough strided loads that seeing more won't make a 2231 // difference. 2232 if (StridedLoads > MaxStridedLoads / 2) 2233 return StridedLoads; 2234 } 2235 } 2236 return StridedLoads; 2237 }; 2238 2239 int StridedLoads = countStridedLoads(L, SE); 2240 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads 2241 << " strided loads\n"); 2242 // Pick the largest power of 2 unroll count that won't result in too many 2243 // strided loads. 2244 if (StridedLoads) { 2245 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); 2246 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " 2247 << UP.MaxCount << '\n'); 2248 } 2249 } 2250 2251 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2252 TTI::UnrollingPreferences &UP, 2253 OptimizationRemarkEmitter *ORE) { 2254 // Enable partial unrolling and runtime unrolling. 2255 BaseT::getUnrollingPreferences(L, SE, UP, ORE); 2256 2257 UP.UpperBound = true; 2258 2259 // For inner loop, it is more likely to be a hot one, and the runtime check 2260 // can be promoted out from LICM pass, so the overhead is less, let's try 2261 // a larger threshold to unroll more loops. 2262 if (L->getLoopDepth() > 1) 2263 UP.PartialThreshold *= 2; 2264 2265 // Disable partial & runtime unrolling on -Os. 
2266 UP.PartialOptSizeThreshold = 0; 2267 2268 if (ST->getProcFamily() == AArch64Subtarget::Falkor && 2269 EnableFalkorHWPFUnrollFix) 2270 getFalkorUnrollingPreferences(L, SE, UP); 2271 2272 // Scan the loop: don't unroll loops with calls as this could prevent 2273 // inlining. Don't unroll vector loops either, as they don't benefit much from 2274 // unrolling. 2275 for (auto *BB : L->getBlocks()) { 2276 for (auto &I : *BB) { 2277 // Don't unroll vectorised loop. 2278 if (I.getType()->isVectorTy()) 2279 return; 2280 2281 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 2282 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 2283 if (!isLoweredToCall(F)) 2284 continue; 2285 } 2286 return; 2287 } 2288 } 2289 } 2290 2291 // Enable runtime unrolling for in-order models 2292 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by 2293 // checking for that case, we can ensure that the default behaviour is 2294 // unchanged 2295 if (ST->getProcFamily() != AArch64Subtarget::Others && 2296 !ST->getSchedModel().isOutOfOrder()) { 2297 UP.Runtime = true; 2298 UP.Partial = true; 2299 UP.UnrollRemainder = true; 2300 UP.DefaultUnrollRuntimeCount = 4; 2301 2302 UP.UnrollAndJam = true; 2303 UP.UnrollAndJamInnerLoopThreshold = 60; 2304 } 2305 } 2306 2307 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 2308 TTI::PeelingPreferences &PP) { 2309 BaseT::getPeelingPreferences(L, SE, PP); 2310 } 2311 2312 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, 2313 Type *ExpectedType) { 2314 switch (Inst->getIntrinsicID()) { 2315 default: 2316 return nullptr; 2317 case Intrinsic::aarch64_neon_st2: 2318 case Intrinsic::aarch64_neon_st3: 2319 case Intrinsic::aarch64_neon_st4: { 2320 // Create a struct type 2321 StructType *ST = dyn_cast<StructType>(ExpectedType); 2322 if (!ST) 2323 return nullptr; 2324 unsigned NumElts = Inst->arg_size() - 1; 2325 if (ST->getNumElements() != NumElts) 2326 return nullptr; 2327 
for (unsigned i = 0, e = NumElts; i != e; ++i) { 2328 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) 2329 return nullptr; 2330 } 2331 Value *Res = UndefValue::get(ExpectedType); 2332 IRBuilder<> Builder(Inst); 2333 for (unsigned i = 0, e = NumElts; i != e; ++i) { 2334 Value *L = Inst->getArgOperand(i); 2335 Res = Builder.CreateInsertValue(Res, L, i); 2336 } 2337 return Res; 2338 } 2339 case Intrinsic::aarch64_neon_ld2: 2340 case Intrinsic::aarch64_neon_ld3: 2341 case Intrinsic::aarch64_neon_ld4: 2342 if (Inst->getType() == ExpectedType) 2343 return Inst; 2344 return nullptr; 2345 } 2346 } 2347 2348 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, 2349 MemIntrinsicInfo &Info) { 2350 switch (Inst->getIntrinsicID()) { 2351 default: 2352 break; 2353 case Intrinsic::aarch64_neon_ld2: 2354 case Intrinsic::aarch64_neon_ld3: 2355 case Intrinsic::aarch64_neon_ld4: 2356 Info.ReadMem = true; 2357 Info.WriteMem = false; 2358 Info.PtrVal = Inst->getArgOperand(0); 2359 break; 2360 case Intrinsic::aarch64_neon_st2: 2361 case Intrinsic::aarch64_neon_st3: 2362 case Intrinsic::aarch64_neon_st4: 2363 Info.ReadMem = false; 2364 Info.WriteMem = true; 2365 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1); 2366 break; 2367 } 2368 2369 switch (Inst->getIntrinsicID()) { 2370 default: 2371 return false; 2372 case Intrinsic::aarch64_neon_ld2: 2373 case Intrinsic::aarch64_neon_st2: 2374 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; 2375 break; 2376 case Intrinsic::aarch64_neon_ld3: 2377 case Intrinsic::aarch64_neon_st3: 2378 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; 2379 break; 2380 case Intrinsic::aarch64_neon_ld4: 2381 case Intrinsic::aarch64_neon_st4: 2382 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; 2383 break; 2384 } 2385 return true; 2386 } 2387 2388 /// See if \p I should be considered for address type promotion. We check if \p 2389 /// I is a sext with right type and used in memory accesses. 
/// If it is used in a
/// "complex" getelementptr, we allow it to be promoted without finding other
/// sext instructions that sign extended the same initial value. A getelementptr
/// is considered as "complex" if it has more than 2 operands.
///
/// \returns true when \p I is a sext producing i64 that has at least one
/// GetElementPtrInst user. \p AllowPromotionWithoutCommonHeader is always
/// written: it is true only if one of those GEP users has more than 2
/// operands, and false otherwise (including on every early return).
bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
  bool Considerable = false;
  // Reset the out-parameter up front so the early returns below leave it in a
  // defined state.
  AllowPromotionWithoutCommonHeader = false;
  if (!isa<SExtInst>(&I))
    return false;
  // Only sign extensions whose result type is i64 are considered.
  Type *ConsideredSExtType =
      Type::getInt64Ty(I.getParent()->getParent()->getContext());
  if (I.getType() != ConsideredSExtType)
    return false;
  // See if the sext is the one with the right type and used in at least one
  // GetElementPtrInst.
  for (const User *U : I.users()) {
    if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
      Considerable = true;
      // A getelementptr is considered as "complex" if it has more than 2
      // operands. We will promote a SExt used in such complex GEP as we
      // expect some computation to be merged if they are done on 64 bits.
      if (GEPInst->getNumOperands() > 2) {
        AllowPromotionWithoutCommonHeader = true;
        // Both results are final at this point; no need to look at more users.
        break;
      }
    }
  }
  return Considerable;
}

/// Decide whether the reduction described by \p RdxDesc may be vectorized with
/// vectorization factor \p VF.
///
/// Fixed-width factors are always accepted. For scalable factors the
/// recurrence type must not be bfloat and must pass
/// isElementTypeLegalForScalableVector(), and the recurrence kind must be one
/// of the kinds listed in the switch below.
bool AArch64TTIImpl::isLegalToVectorizeReduction(
    const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
  if (!VF.isScalable())
    return true;

  Type *Ty = RdxDesc.getRecurrenceType();
  if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
    return false;

  // Allow-list of recurrence kinds accepted for scalable vectors; every other
  // kind is rejected.
  switch (RdxDesc.getRecurrenceKind()) {
  case RecurKind::Add:
  case RecurKind::FAdd:
  case RecurKind::And:
  case RecurKind::Or:
  case RecurKind::Xor:
  case RecurKind::SMin:
  case RecurKind::SMax:
  case RecurKind::UMin:
  case RecurKind::UMax:
  case RecurKind::FMin:
  case RecurKind::FMax:
  case RecurKind::SelectICmp:
  case RecurKind::SelectFCmp:
  case RecurKind::FMulAdd:
    return true;
  default:
    return false;
  }
}

/// Estimate the cost of a min/max reduction over the vector type \p Ty.
///
/// f16 vectors on subtargets without full FP16 support are costed by the
/// generic BaseT implementation. Otherwise the result is a fixed cost of 2 for
/// the final horizontal reduction plus, when the legalization cost LT.first is
/// greater than 1, (LT.first - 1) times the cost of one min/max intrinsic on
/// the legalized vector type.
///
/// \p Ty and \p CondTy must either both be scalable or both be fixed-width
/// (checked by assertion).
InstructionCost
AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                       bool IsUnsigned,
                                       TTI::TargetCostKind CostKind) {
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);

  assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
         "Both vector needs to be equally scalable");

  InstructionCost LegalizationCost = 0;
  if (LT.first > 1) {
    Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
    // NOTE(review): the intrinsic used for costing is always maxnum (FP) or
    // umin/smin (integer); this function has no min-vs-max parameter, so
    // presumably min and max are assumed to cost the same - confirm.
    unsigned MinMaxOpcode =
        Ty->isFPOrFPVectorTy()
            ? Intrinsic::maxnum
            : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
    IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy});
    LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
  }

  return LegalizationCost + /*Cost of horizontal reduction*/ 2;
}

/// Estimate the cost of an arithmetic reduction with opcode \p Opcode over
/// \p ValTy; getArithmeticReductionCost() calls this for scalable vector types.
///
/// When the legalization cost LT.first is greater than 1, (LT.first - 1)
/// arithmetic instructions on the legalized type are charged. The final
/// horizontal reduction then adds 2 for ADD, AND, OR, XOR and FADD; any other
/// opcode yields an invalid cost.
InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
    unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
  InstructionCost LegalizationCost = 0;
  if (LT.first > 1) {
    Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
    LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
    LegalizationCost *= LT.first - 1;
  }

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");
  // Add the final reduction cost for the legal horizontal reduction
  switch (ISD) {
  case ISD::ADD:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::FADD:
    return LegalizationCost + 2;
  default:
    return InstructionCost::getInvalid();
  }
}

/// Estimate the cost of reducing the vector \p ValTy with \p Opcode.
///
/// The cases are handled in this order:
///  - Ordered reductions (as reported by TTI::requiresOrderedReduction(FMF)):
///    fixed-width vectors use the BaseT cost plus one per element; scalable
///    vectors are only supported for FAdd and cost one scalar operation per
///    element of the maximum element count, anything else is invalid.
///  - Unordered scalable reductions are forwarded to
///    getArithmeticReductionCostSVE().
///  - Unordered fixed-width ADD/OR/XOR/AND reductions are looked up in the
///    table below, keyed on the legalized type.
///  - Everything else falls back to BaseT.
InstructionCost
AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                           Optional<FastMathFlags> FMF,
                                           TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF)) {
    if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
      InstructionCost BaseCost =
          BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
      // Add on extra cost to reflect the extra overhead on some CPUs. We still
      // end up vectorizing for more computationally intensive loops.
      return BaseCost + FixedVTy->getNumElements();
    }

    if (Opcode != Instruction::FAdd)
      return InstructionCost::getInvalid();

    // Ordered scalable FAdd: charge one scalar FAdd per element, using the
    // maximum number of elements the scalable vector may hold.
    auto *VTy = cast<ScalableVectorType>(ValTy);
    InstructionCost Cost =
        getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
    Cost *= getMaxNumElements(VTy->getElementCount());
    return Cost;
  }

  if (isa<ScalableVectorType>(ValTy))
    return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
  MVT MTy = LT.second;
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Horizontal adds can use the 'addv' instruction. We model the cost of these
  // instructions as twice a normal vector add, plus 1 for each legalization
  // step (LT.first). This is the only arithmetic vector reduction operation for
  // which we have an instruction.
  // OR, XOR and AND costs should match the codegen from:
  // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
  // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
  // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
  static const CostTblEntry CostTblNoPairwise[]{
      {ISD::ADD, MVT::v8i8, 2},
      {ISD::ADD, MVT::v16i8, 2},
      {ISD::ADD, MVT::v4i16, 2},
      {ISD::ADD, MVT::v8i16, 2},
      {ISD::ADD, MVT::v4i32, 2},
      {ISD::OR, MVT::v8i8, 15},
      {ISD::OR, MVT::v16i8, 17},
      {ISD::OR, MVT::v4i16, 7},
      {ISD::OR, MVT::v8i16, 9},
      {ISD::OR, MVT::v2i32, 3},
      {ISD::OR, MVT::v4i32, 5},
      {ISD::OR, MVT::v2i64, 3},
      {ISD::XOR, MVT::v8i8, 15},
      {ISD::XOR, MVT::v16i8, 17},
      {ISD::XOR, MVT::v4i16, 7},
      {ISD::XOR, MVT::v8i16, 9},
      {ISD::XOR, MVT::v2i32, 3},
      {ISD::XOR, MVT::v4i32, 5},
      {ISD::XOR, MVT::v2i64, 3},
      {ISD::AND, MVT::v8i8, 15},
      {ISD::AND, MVT::v16i8, 17},
      {ISD::AND, MVT::v4i16, 7},
      {ISD::AND, MVT::v8i16, 9},
      {ISD::AND, MVT::v2i32, 3},
      {ISD::AND, MVT::v4i32, 5},
      {ISD::AND, MVT::v2i64, 3},
  };
  switch (ISD) {
  default:
    break;
  case ISD::ADD:
    // One extra unit per additional legalization step on top of the table
    // cost.
    if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
      return (LT.first - 1) + Entry->Cost;
    break;
  case ISD::XOR:
  case ISD::AND:
  case ISD::OR:
    // NOTE: Entry is declared directly under the case label without braces.
    // That is only valid because this is the last label of the switch; wrap
    // this body in braces before adding another case after it.
    const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
    if (!Entry)
      break;
    auto *ValVTy = cast<FixedVectorType>(ValTy);
    // The table cost is only used for non-i1 element types whose element
    // count is a power of two and at least as large as the legalized type's;
    // other cases fall through to the BaseT estimate.
    if (!ValVTy->getElementType()->isIntegerTy(1) &&
        MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
        isPowerOf2_32(ValVTy->getNumElements())) {
      InstructionCost ExtraCost = 0;
      if (LT.first != 1) {
        // Type needs to be split, so there is an extra cost of LT.first - 1
        // arithmetic ops.
        auto *Ty = FixedVectorType::get(ValTy->getElementType(),
                                        MTy.getVectorNumElements());
        ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
        ExtraCost *= LT.first - 1;
      }
      return Entry->Cost + ExtraCost;
    }
    break;
  }
  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}

/// Estimate the cost of an SK_Splice shuffle of \p Tp at position \p Index;
/// getShuffleCost() calls this for scalable vector types.
///
/// The per-part cost is the table entry for the (possibly promoted) legal
/// type, plus:
///  - a compare and a select when \p Index is negative, and
///  - a zext and a trunc when the legal element type is i1, because the
///    operation is costed on the promoted type.
/// The sum is multiplied by the legalization cost LT.first. Costs are computed
/// with TTI::TCK_RecipThroughput.
InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
  static const CostTblEntry ShuffleTbl[] = {
      { TTI::SK_Splice, MVT::nxv16i8, 1 },
      { TTI::SK_Splice, MVT::nxv8i16, 1 },
      { TTI::SK_Splice, MVT::nxv4i32, 1 },
      { TTI::SK_Splice, MVT::nxv2i64, 1 },
      { TTI::SK_Splice, MVT::nxv2f16, 1 },
      { TTI::SK_Splice, MVT::nxv4f16, 1 },
      { TTI::SK_Splice, MVT::nxv8f16, 1 },
      { TTI::SK_Splice, MVT::nxv2bf16, 1 },
      { TTI::SK_Splice, MVT::nxv4bf16, 1 },
      { TTI::SK_Splice, MVT::nxv8bf16, 1 },
      { TTI::SK_Splice, MVT::nxv2f32, 1 },
      { TTI::SK_Splice, MVT::nxv4f32, 1 },
      { TTI::SK_Splice, MVT::nxv2f64, 1 },
  };

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
  Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  // i1 (predicate) vectors are costed on the type returned by
  // getPromotedVTForPredicate(); every other type is costed as-is.
  EVT PromotedVT = LT.second.getScalarType() == MVT::i1
                       ? TLI->getPromotedVTForPredicate(EVT(LT.second))
                       : LT.second;
  Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
  InstructionCost LegalizationCost = 0;
  // A negative index is charged one compare plus one select.
  if (Index < 0) {
    LegalizationCost =
        getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
                           CmpInst::BAD_ICMP_PREDICATE, CostKind) +
        getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
                           CmpInst::BAD_ICMP_PREDICATE, CostKind);
  }

  // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
  // Cost performed on a promoted type.
  if (LT.second.getScalarType() == MVT::i1) {
    LegalizationCost +=
        getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
                         TTI::CastContextHint::None, CostKind) +
        getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
                         TTI::CastContextHint::None, CostKind);
  }
  const auto *Entry =
      CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
  // NOTE(review): Entry is only guarded by this assert; in a build without
  // assertions a type missing from ShuffleTbl would be dereferenced as null.
  assert(Entry && "Illegal Type for Splice");
  LegalizationCost += Entry->Cost;
  return LegalizationCost * LT.first;
}

/// Estimate the cost of a shuffle of kind \p Kind on the vector type \p Tp.
///
/// The checks are applied in this order, and the first that matches decides
/// the result:
///  1. Fixed-width shuffles with a mask whose type legalizes to a narrower
///     vector of the same element size (and with no \p Index / \p SubTp) are
///     split into legal-width sub-shuffles that are costed individually.
///  2. Broadcasts of a load that isLegalBroadcastLoad() accepts are free.
///  3. 4-element shuffles of 16- or 32-bit elements use the perfect shuffle
///     tables.
///  4. Broadcast, Transpose, Select, PermuteSingleSrc and Reverse are looked
///     up in the cost table below and scaled by LT.first.
///  5. SK_Splice on scalable vectors is forwarded to getSpliceCost().
///  6. Aligned subvector inserts into legal vectors of at most 128 bits cost
///     SubLT.first.
///  7. Anything else is costed by BaseT::getShuffleCost().
InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                               VectorType *Tp,
                                               ArrayRef<int> Mask, int Index,
                                               VectorType *SubTp,
                                               ArrayRef<const Value *> Args) {
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
  // If we have a Mask, and the LT is being legalized somehow, split the Mask
  // into smaller vectors and sum the cost of each shuffle.
  if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
      Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
      cast<FixedVectorType>(Tp)->getNumElements() >
          LT.second.getVectorNumElements() &&
      !Index && !SubTp) {
    unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements();
    assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!");
    unsigned LTNumElts = LT.second.getVectorNumElements();
    // Number of legal-width chunks needed to cover Tp, rounded up.
    unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
    VectorType *NTp =
        VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
    InstructionCost Cost;
    for (unsigned N = 0; N < NumVecs; N++) {
      SmallVector<int> NMask;
      // Split the existing mask into chunks of size LTNumElts. Track the source
      // sub-vectors to ensure the result has at most 2 inputs.
      // Source1 is only read once NumSources >= 1 and Source2 only once
      // NumSources >= 2, so neither is read before it has been assigned.
      unsigned Source1, Source2;
      unsigned NumSources = 0;
      for (unsigned E = 0; E < LTNumElts; E++) {
        // Lanes past the end of the original mask are treated as undef.
        int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
                                                      : UndefMaskElem;
        if (MaskElt < 0) {
          NMask.push_back(UndefMaskElem);
          continue;
        }

        // Calculate which source from the input this comes from and whether it
        // is new to us.
        unsigned Source = MaskElt / LTNumElts;
        if (NumSources == 0) {
          Source1 = Source;
          NumSources = 1;
        } else if (NumSources == 1 && Source != Source1) {
          Source2 = Source;
          NumSources = 2;
        } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
          NumSources++;
        }

        // Add to the new mask. For the NumSources>2 case these are not correct,
        // but are only used for the modular lane number.
        if (Source == Source1)
          NMask.push_back(MaskElt % LTNumElts);
        else if (Source == Source2)
          NMask.push_back(MaskElt % LTNumElts + LTNumElts);
        else
          NMask.push_back(MaskElt % LTNumElts);
      }
      // If the sub-mask has at most 2 input sub-vectors then re-cost it using
      // getShuffleCost. If not then cost it using the worst case.
      // The worst case is LTNumElts, reduced by one when at least one lane of
      // the sub-mask keeps its own lane number.
      // NOTE(review): undef lanes (-1) are converted to unsigned by the `%`
      // below, so they are not skipped by this check - confirm that is
      // intended.
      if (NumSources <= 2)
        Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
                                               : TTI::SK_PermuteTwoSrc,
                               NTp, NMask, 0, nullptr, Args);
      else if (any_of(enumerate(NMask), [&](const auto &ME) {
                 return ME.value() % LTNumElts == ME.index();
               }))
        Cost += LTNumElts - 1;
      else
        Cost += LTNumElts;
    }
    return Cost;
  }

  // Let the mask refine the shuffle kind before the kind-based checks below.
  Kind = improveShuffleKindFromMask(Kind, Mask);

  // Check for broadcast loads.
  if (Kind == TTI::SK_Broadcast) {
    bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
    if (IsLoad && LT.second.isVector() &&
        isLegalBroadcastLoad(Tp->getElementType(),
                             LT.second.getVectorElementCount()))
      return 0; // broadcast is handled by ld1r
  }

  // If we have 4 elements for the shuffle and a Mask, get the cost straight
  // from the perfect shuffle tables.
  if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
      (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
      all_of(Mask, [](int E) { return E < 8; }))
    return getPerfectShuffleCost(Mask);

  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
      Kind == TTI::SK_Reverse) {

    static const CostTblEntry ShuffleTbl[] = {
      // Broadcast shuffle kinds can be performed with 'dup'.
      { TTI::SK_Broadcast, MVT::v8i8, 1 },
      { TTI::SK_Broadcast, MVT::v16i8, 1 },
      { TTI::SK_Broadcast, MVT::v4i16, 1 },
      { TTI::SK_Broadcast, MVT::v8i16, 1 },
      { TTI::SK_Broadcast, MVT::v2i32, 1 },
      { TTI::SK_Broadcast, MVT::v4i32, 1 },
      { TTI::SK_Broadcast, MVT::v2i64, 1 },
      { TTI::SK_Broadcast, MVT::v2f32, 1 },
      { TTI::SK_Broadcast, MVT::v4f32, 1 },
      { TTI::SK_Broadcast, MVT::v2f64, 1 },
      // Transpose shuffle kinds can be performed with 'trn1/trn2' and
      // 'zip1/zip2' instructions.
      { TTI::SK_Transpose, MVT::v8i8, 1 },
      { TTI::SK_Transpose, MVT::v16i8, 1 },
      { TTI::SK_Transpose, MVT::v4i16, 1 },
      { TTI::SK_Transpose, MVT::v8i16, 1 },
      { TTI::SK_Transpose, MVT::v2i32, 1 },
      { TTI::SK_Transpose, MVT::v4i32, 1 },
      { TTI::SK_Transpose, MVT::v2i64, 1 },
      { TTI::SK_Transpose, MVT::v2f32, 1 },
      { TTI::SK_Transpose, MVT::v4f32, 1 },
      { TTI::SK_Transpose, MVT::v2f64, 1 },
      // Select shuffle kinds.
      // TODO: handle vXi8/vXi16.
      { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
      { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
      { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
      { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
      { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
      { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
      // PermuteSingleSrc shuffle kinds.
      { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
      { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
      { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
      { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
      { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
      { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
      { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case.
      { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case.
      { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case.
      { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl
      { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl
      { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl
      { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl
      { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl
      // Reverse can be lowered with `rev`.
      { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov.
      { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT
      { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov.
      { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov.
      { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT
      { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov.
      { TTI::SK_Reverse, MVT::v8f16, 2 }, // REV64; EXT
      { TTI::SK_Reverse, MVT::v8i16, 2 }, // REV64; EXT
      { TTI::SK_Reverse, MVT::v16i8, 2 }, // REV64; EXT
      { TTI::SK_Reverse, MVT::v4f16, 1 }, // REV64
      { TTI::SK_Reverse, MVT::v4i16, 1 }, // REV64
      { TTI::SK_Reverse, MVT::v8i8, 1 }, // REV64
      // Broadcast shuffle kinds for scalable vectors
      { TTI::SK_Broadcast, MVT::nxv16i8, 1 },
      { TTI::SK_Broadcast, MVT::nxv8i16, 1 },
      { TTI::SK_Broadcast, MVT::nxv4i32, 1 },
      { TTI::SK_Broadcast, MVT::nxv2i64, 1 },
      { TTI::SK_Broadcast, MVT::nxv2f16, 1 },
      { TTI::SK_Broadcast, MVT::nxv4f16, 1 },
      { TTI::SK_Broadcast, MVT::nxv8f16, 1 },
      { TTI::SK_Broadcast, MVT::nxv2bf16, 1 },
      { TTI::SK_Broadcast, MVT::nxv4bf16, 1 },
      { TTI::SK_Broadcast, MVT::nxv8bf16, 1 },
      { TTI::SK_Broadcast, MVT::nxv2f32, 1 },
      { TTI::SK_Broadcast, MVT::nxv4f32, 1 },
      { TTI::SK_Broadcast, MVT::nxv2f64, 1 },
      { TTI::SK_Broadcast, MVT::nxv16i1, 1 },
      { TTI::SK_Broadcast, MVT::nxv8i1, 1 },
      { TTI::SK_Broadcast, MVT::nxv4i1, 1 },
      { TTI::SK_Broadcast, MVT::nxv2i1, 1 },
      // Handle the cases for vector.reverse with scalable vectors
      { TTI::SK_Reverse, MVT::nxv16i8, 1 },
      { TTI::SK_Reverse, MVT::nxv8i16, 1 },
      { TTI::SK_Reverse, MVT::nxv4i32, 1 },
      { TTI::SK_Reverse, MVT::nxv2i64, 1 },
      { TTI::SK_Reverse, MVT::nxv2f16, 1 },
      { TTI::SK_Reverse, MVT::nxv4f16, 1 },
      { TTI::SK_Reverse, MVT::nxv8f16, 1 },
      { TTI::SK_Reverse, MVT::nxv2bf16, 1 },
      { TTI::SK_Reverse, MVT::nxv4bf16, 1 },
      { TTI::SK_Reverse, MVT::nxv8bf16, 1 },
      { TTI::SK_Reverse, MVT::nxv2f32, 1 },
      { TTI::SK_Reverse, MVT::nxv4f32, 1 },
      { TTI::SK_Reverse, MVT::nxv2f64, 1 },
      { TTI::SK_Reverse, MVT::nxv16i1, 1 },
      { TTI::SK_Reverse, MVT::nxv8i1, 1 },
      { TTI::SK_Reverse, MVT::nxv4i1, 1 },
      { TTI::SK_Reverse, MVT::nxv2i1, 1 },
    };
    // The table cost is per legalized vector, so scale it by LT.first.
    if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
  }

  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
    return getSpliceCost(Tp, Index);

  // Inserting a subvector can often be done with either a D, S or H register
  // move, so long as the inserted vector is "aligned".
  if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
      LT.second.getSizeInBits() <= 128 && SubTp) {
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumElts = LT.second.getVectorNumElements();
      int NumSubElts = SubLT.second.getVectorNumElements();
      // "Aligned" here means that the insertion index is a multiple of the
      // subvector length and that the subvector length divides the vector
      // length.
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}