//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64PerfectShuffle.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <algorithm>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                           cl::Hidden);

static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(10), cl::Hidden);

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
  return (CallerBits & CalleeBits) == CalleeBits;
}

bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
    TargetTransformInfo::RegisterKind K) const {
  assert(K != TargetTransformInfo::RGK_Scalar);
  return K == TargetTransformInfo::RGK_FixedWidthVector;
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}

/// Calculate the cost of materializing the given constant.
InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
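  // Illustrative example: for an i128 immediate the loop below sums the cost
  // of the two 64-bit chunks, so halves that each need a MOVZ plus three MOVKs
  // cost 4 + 4 = 8, while a chunk that is zero or a valid logical immediate
  // contributes 0 (subject to the minimum of 1 returned below).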
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}

InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
                                        MVT::v8i16, MVT::v2i32, MVT::v4i32};
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    // v2i64 types get converted to cmp+bif hence the cost of 2
    if (LT.second == MVT::v2i64)
      return LT.first * 2;
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ?
            1 : 4;
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
    break;
  }
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::experimental_stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
    // (LT.first - 1) vector adds.
    if (LT.first > 1) {
      Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
      InstructionCost AddCost =
          getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
      Cost += AddCost * (LT.first - 1);
    }
    return Cost;
  }
  case Intrinsic::bitreverse: {
    static const CostTblEntry BitreverseTbl[] = {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},
    };
    const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy);
    const auto *Entry =
        CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
    if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 will be
      // converted to, plus 1 so that we match the actual lowering cost.
      if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
          TLI->getValueType(DL, RetTy, true) == MVT::i16)
        return LegalisationCost.first * Entry->Cost + 1;

      return LegalisationCost.first * Entry->Cost;
    }
    break;
  }
  case Intrinsic::ctpop: {
    static const CostTblEntry CtpopCostTbl[] = {
        {ISD::CTPOP, MVT::v2i64, 4},
        {ISD::CTPOP, MVT::v4i32, 3},
        {ISD::CTPOP, MVT::v8i16, 2},
        {ISD::CTPOP, MVT::v16i8, 1},
        {ISD::CTPOP, MVT::i64, 4},
        {ISD::CTPOP, MVT::v2i32, 3},
        {ISD::CTPOP, MVT::v4i16, 2},
        {ISD::CTPOP, MVT::v8i8, 1},
        {ISD::CTPOP, MVT::i32, 5},
    };
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    MVT MTy = LT.second;
    if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
      // Extra cost of +1 when illegal vector types are legalized by promoting
      // the integer type.
      int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
                                            RetTy->getScalarSizeInBits()
                          ?
1 336 : 0; 337 return LT.first * Entry->Cost + ExtraCost; 338 } 339 break; 340 } 341 case Intrinsic::sadd_with_overflow: 342 case Intrinsic::uadd_with_overflow: 343 case Intrinsic::ssub_with_overflow: 344 case Intrinsic::usub_with_overflow: 345 case Intrinsic::smul_with_overflow: 346 case Intrinsic::umul_with_overflow: { 347 static const CostTblEntry WithOverflowCostTbl[] = { 348 {Intrinsic::sadd_with_overflow, MVT::i8, 3}, 349 {Intrinsic::uadd_with_overflow, MVT::i8, 3}, 350 {Intrinsic::sadd_with_overflow, MVT::i16, 3}, 351 {Intrinsic::uadd_with_overflow, MVT::i16, 3}, 352 {Intrinsic::sadd_with_overflow, MVT::i32, 1}, 353 {Intrinsic::uadd_with_overflow, MVT::i32, 1}, 354 {Intrinsic::sadd_with_overflow, MVT::i64, 1}, 355 {Intrinsic::uadd_with_overflow, MVT::i64, 1}, 356 {Intrinsic::ssub_with_overflow, MVT::i8, 3}, 357 {Intrinsic::usub_with_overflow, MVT::i8, 3}, 358 {Intrinsic::ssub_with_overflow, MVT::i16, 3}, 359 {Intrinsic::usub_with_overflow, MVT::i16, 3}, 360 {Intrinsic::ssub_with_overflow, MVT::i32, 1}, 361 {Intrinsic::usub_with_overflow, MVT::i32, 1}, 362 {Intrinsic::ssub_with_overflow, MVT::i64, 1}, 363 {Intrinsic::usub_with_overflow, MVT::i64, 1}, 364 {Intrinsic::smul_with_overflow, MVT::i8, 5}, 365 {Intrinsic::umul_with_overflow, MVT::i8, 4}, 366 {Intrinsic::smul_with_overflow, MVT::i16, 5}, 367 {Intrinsic::umul_with_overflow, MVT::i16, 4}, 368 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst 369 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw 370 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp 371 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr 372 }; 373 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true); 374 if (MTy.isSimple()) 375 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(), 376 MTy.getSimpleVT())) 377 return Entry->Cost; 378 break; 379 } 380 case Intrinsic::fptosi_sat: 381 case Intrinsic::fptoui_sat: { 382 if (ICA.getArgTypes().empty()) 383 break; 384 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; 385 auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]); 386 EVT MTy = TLI->getValueType(DL, RetTy); 387 // Check for the legal types, which are where the size of the input and the 388 // output are the same, or we are using cvt f64->i32 or f32->i64. 389 if ((LT.second == MVT::f32 || LT.second == MVT::f64 || 390 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || 391 LT.second == MVT::v2f64) && 392 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || 393 (LT.second == MVT::f64 && MTy == MVT::i32) || 394 (LT.second == MVT::f32 && MTy == MVT::i64))) 395 return LT.first; 396 // Similarly for fp16 sizes 397 if (ST->hasFullFP16() && 398 ((LT.second == MVT::f16 && MTy == MVT::i32) || 399 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && 400 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))) 401 return LT.first; 402 403 // Otherwise we use a legal convert followed by a min+max 404 if ((LT.second.getScalarType() == MVT::f32 || 405 LT.second.getScalarType() == MVT::f64 || 406 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) && 407 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { 408 Type *LegalTy = 409 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits()); 410 if (LT.second.isVector()) 411 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount()); 412 InstructionCost Cost = 1; 413 IntrinsicCostAttributes Attrs1(IsSigned ? 
Intrinsic::smin : Intrinsic::umin, 414 LegalTy, {LegalTy, LegalTy}); 415 Cost += getIntrinsicInstrCost(Attrs1, CostKind); 416 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax, 417 LegalTy, {LegalTy, LegalTy}); 418 Cost += getIntrinsicInstrCost(Attrs2, CostKind); 419 return LT.first * Cost; 420 } 421 break; 422 } 423 default: 424 break; 425 } 426 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 427 } 428 429 /// The function will remove redundant reinterprets casting in the presence 430 /// of the control flow 431 static Optional<Instruction *> processPhiNode(InstCombiner &IC, 432 IntrinsicInst &II) { 433 SmallVector<Instruction *, 32> Worklist; 434 auto RequiredType = II.getType(); 435 436 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0)); 437 assert(PN && "Expected Phi Node!"); 438 439 // Don't create a new Phi unless we can remove the old one. 440 if (!PN->hasOneUse()) 441 return None; 442 443 for (Value *IncValPhi : PN->incoming_values()) { 444 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi); 445 if (!Reinterpret || 446 Reinterpret->getIntrinsicID() != 447 Intrinsic::aarch64_sve_convert_to_svbool || 448 RequiredType != Reinterpret->getArgOperand(0)->getType()) 449 return None; 450 } 451 452 // Create the new Phi 453 LLVMContext &Ctx = PN->getContext(); 454 IRBuilder<> Builder(Ctx); 455 Builder.SetInsertPoint(PN); 456 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); 457 Worklist.push_back(PN); 458 459 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { 460 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); 461 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); 462 Worklist.push_back(Reinterpret); 463 } 464 465 // Cleanup Phi Node and reinterprets 466 return IC.replaceInstUsesWith(II, NPN); 467 } 468 469 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))) 470 // => (binop (pred) (from_svbool _) (from_svbool _)) 471 // 472 // The above transformation eliminates a `to_svbool` in the predicate 473 // operand of bitwise operation `binop` by narrowing the vector width of 474 // the operation. For example, it would convert a `<vscale x 16 x i1> 475 // and` into a `<vscale x 4 x i1> and`. This is profitable because 476 // to_svbool must zero the new lanes during widening, whereas 477 // from_svbool is free. 
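// An illustrative sketch of the rewrite, in the same notation as above, for a
// <vscale x 4 x i1> predicate %p:
//   (from_svbool (and_z (to_svbool %p) %a %b))
//     => (and_z %p (from_svbool %a) (from_svbool %b))
// so the bitwise operation now runs at <vscale x 4 x i1> width.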
478 static Optional<Instruction *> tryCombineFromSVBoolBinOp(InstCombiner &IC, 479 IntrinsicInst &II) { 480 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0)); 481 if (!BinOp) 482 return None; 483 484 auto IntrinsicID = BinOp->getIntrinsicID(); 485 switch (IntrinsicID) { 486 case Intrinsic::aarch64_sve_and_z: 487 case Intrinsic::aarch64_sve_bic_z: 488 case Intrinsic::aarch64_sve_eor_z: 489 case Intrinsic::aarch64_sve_nand_z: 490 case Intrinsic::aarch64_sve_nor_z: 491 case Intrinsic::aarch64_sve_orn_z: 492 case Intrinsic::aarch64_sve_orr_z: 493 break; 494 default: 495 return None; 496 } 497 498 auto BinOpPred = BinOp->getOperand(0); 499 auto BinOpOp1 = BinOp->getOperand(1); 500 auto BinOpOp2 = BinOp->getOperand(2); 501 502 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred); 503 if (!PredIntr || 504 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) 505 return None; 506 507 auto PredOp = PredIntr->getOperand(0); 508 auto PredOpTy = cast<VectorType>(PredOp->getType()); 509 if (PredOpTy != II.getType()) 510 return None; 511 512 IRBuilder<> Builder(II.getContext()); 513 Builder.SetInsertPoint(&II); 514 515 SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; 516 auto NarrowBinOpOp1 = Builder.CreateIntrinsic( 517 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1}); 518 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 519 if (BinOpOp1 == BinOpOp2) 520 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 521 else 522 NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic( 523 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2})); 524 525 auto NarrowedBinOp = 526 Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs); 527 return IC.replaceInstUsesWith(II, NarrowedBinOp); 528 } 529 530 static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC, 531 IntrinsicInst &II) { 532 // If the reinterpret instruction operand is a PHI Node 533 if (isa<PHINode>(II.getArgOperand(0))) 534 return processPhiNode(IC, II); 535 536 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) 537 return BinOpCombine; 538 539 SmallVector<Instruction *, 32> CandidatesForRemoval; 540 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; 541 542 const auto *IVTy = cast<VectorType>(II.getType()); 543 544 // Walk the chain of conversions. 545 while (Cursor) { 546 // If the type of the cursor has fewer lanes than the final result, zeroing 547 // must take place, which breaks the equivalence chain. 548 const auto *CursorVTy = cast<VectorType>(Cursor->getType()); 549 if (CursorVTy->getElementCount().getKnownMinValue() < 550 IVTy->getElementCount().getKnownMinValue()) 551 break; 552 553 // If the cursor has the same type as I, it is a viable replacement. 554 if (Cursor->getType() == IVTy) 555 EarliestReplacement = Cursor; 556 557 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); 558 559 // If this is not an SVE conversion intrinsic, this is the end of the chain. 560 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == 561 Intrinsic::aarch64_sve_convert_to_svbool || 562 IntrinsicCursor->getIntrinsicID() == 563 Intrinsic::aarch64_sve_convert_from_svbool)) 564 break; 565 566 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); 567 Cursor = IntrinsicCursor->getOperand(0); 568 } 569 570 // If no viable replacement in the conversion chain was found, there is 571 // nothing to do. 
572 if (!EarliestReplacement) 573 return None; 574 575 return IC.replaceInstUsesWith(II, EarliestReplacement); 576 } 577 578 static Optional<Instruction *> instCombineSVESel(InstCombiner &IC, 579 IntrinsicInst &II) { 580 IRBuilder<> Builder(&II); 581 auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1), 582 II.getOperand(2)); 583 return IC.replaceInstUsesWith(II, Select); 584 } 585 586 static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC, 587 IntrinsicInst &II) { 588 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 589 if (!Pg) 590 return None; 591 592 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 593 return None; 594 595 const auto PTruePattern = 596 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 597 if (PTruePattern != AArch64SVEPredPattern::vl1) 598 return None; 599 600 // The intrinsic is inserting into lane zero so use an insert instead. 601 auto *IdxTy = Type::getInt64Ty(II.getContext()); 602 auto *Insert = InsertElementInst::Create( 603 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0)); 604 Insert->insertBefore(&II); 605 Insert->takeName(&II); 606 607 return IC.replaceInstUsesWith(II, Insert); 608 } 609 610 static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, 611 IntrinsicInst &II) { 612 // Replace DupX with a regular IR splat. 613 IRBuilder<> Builder(II.getContext()); 614 Builder.SetInsertPoint(&II); 615 auto *RetTy = cast<ScalableVectorType>(II.getType()); 616 Value *Splat = 617 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0)); 618 Splat->takeName(&II); 619 return IC.replaceInstUsesWith(II, Splat); 620 } 621 622 static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, 623 IntrinsicInst &II) { 624 LLVMContext &Ctx = II.getContext(); 625 IRBuilder<> Builder(Ctx); 626 Builder.SetInsertPoint(&II); 627 628 // Check that the predicate is all active 629 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 630 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 631 return None; 632 633 const auto PTruePattern = 634 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 635 if (PTruePattern != AArch64SVEPredPattern::all) 636 return None; 637 638 // Check that we have a compare of zero.. 
639 auto *SplatValue = 640 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2))); 641 if (!SplatValue || !SplatValue->isZero()) 642 return None; 643 644 // ..against a dupq 645 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 646 if (!DupQLane || 647 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) 648 return None; 649 650 // Where the dupq is a lane 0 replicate of a vector insert 651 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero()) 652 return None; 653 654 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); 655 if (!VecIns || 656 VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert) 657 return None; 658 659 // Where the vector insert is a fixed constant vector insert into undef at 660 // index zero 661 if (!isa<UndefValue>(VecIns->getArgOperand(0))) 662 return None; 663 664 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero()) 665 return None; 666 667 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1)); 668 if (!ConstVec) 669 return None; 670 671 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType()); 672 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType()); 673 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) 674 return None; 675 676 unsigned NumElts = VecTy->getNumElements(); 677 unsigned PredicateBits = 0; 678 679 // Expand intrinsic operands to a 16-bit byte level predicate 680 for (unsigned I = 0; I < NumElts; ++I) { 681 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I)); 682 if (!Arg) 683 return None; 684 if (!Arg->isZero()) 685 PredicateBits |= 1 << (I * (16 / NumElts)); 686 } 687 688 // If all bits are zero bail early with an empty predicate 689 if (PredicateBits == 0) { 690 auto *PFalse = Constant::getNullValue(II.getType()); 691 PFalse->takeName(&II); 692 return IC.replaceInstUsesWith(II, PFalse); 693 } 694 695 // Calculate largest predicate type used (where byte predicate is largest) 696 unsigned Mask = 8; 697 for (unsigned I = 0; I < 16; ++I) 698 if ((PredicateBits & (1 << I)) != 0) 699 Mask |= (I % 8); 700 701 unsigned PredSize = Mask & -Mask; 702 auto *PredType = ScalableVectorType::get( 703 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8)); 704 705 // Ensure all relevant bits are set 706 for (unsigned I = 0; I < 16; I += PredSize) 707 if ((PredicateBits & (1 << I)) == 0) 708 return None; 709 710 auto *PTruePat = 711 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 712 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 713 {PredType}, {PTruePat}); 714 auto *ConvertToSVBool = Builder.CreateIntrinsic( 715 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); 716 auto *ConvertFromSVBool = 717 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, 718 {II.getType()}, {ConvertToSVBool}); 719 720 ConvertFromSVBool->takeName(&II); 721 return IC.replaceInstUsesWith(II, ConvertFromSVBool); 722 } 723 724 static Optional<Instruction *> instCombineSVELast(InstCombiner &IC, 725 IntrinsicInst &II) { 726 IRBuilder<> Builder(II.getContext()); 727 Builder.SetInsertPoint(&II); 728 Value *Pg = II.getArgOperand(0); 729 Value *Vec = II.getArgOperand(1); 730 auto IntrinsicID = II.getIntrinsicID(); 731 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; 732 733 // lastX(splat(X)) --> X 734 if (auto *SplatVal = getSplatValue(Vec)) 735 return IC.replaceInstUsesWith(II, SplatVal); 736 737 // If x and/or y is a splat value then: 738 // lastX (binop (x, y)) --> 
binop(lastX(x), lastX(y)) 739 Value *LHS, *RHS; 740 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) { 741 if (isSplatValue(LHS) || isSplatValue(RHS)) { 742 auto *OldBinOp = cast<BinaryOperator>(Vec); 743 auto OpC = OldBinOp->getOpcode(); 744 auto *NewLHS = 745 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS}); 746 auto *NewRHS = 747 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS}); 748 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( 749 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II); 750 return IC.replaceInstUsesWith(II, NewBinOp); 751 } 752 } 753 754 auto *C = dyn_cast<Constant>(Pg); 755 if (IsAfter && C && C->isNullValue()) { 756 // The intrinsic is extracting lane 0 so use an extract instead. 757 auto *IdxTy = Type::getInt64Ty(II.getContext()); 758 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0)); 759 Extract->insertBefore(&II); 760 Extract->takeName(&II); 761 return IC.replaceInstUsesWith(II, Extract); 762 } 763 764 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg); 765 if (!IntrPG) 766 return None; 767 768 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 769 return None; 770 771 const auto PTruePattern = 772 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue(); 773 774 // Can the intrinsic's predicate be converted to a known constant index? 775 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern); 776 if (!MinNumElts) 777 return None; 778 779 unsigned Idx = MinNumElts - 1; 780 // Increment the index if extracting the element after the last active 781 // predicate element. 782 if (IsAfter) 783 ++Idx; 784 785 // Ignore extracts whose index is larger than the known minimum vector 786 // length. NOTE: This is an artificial constraint where we prefer to 787 // maintain what the user asked for until an alternative is proven faster. 788 auto *PgVTy = cast<ScalableVectorType>(Pg->getType()); 789 if (Idx >= PgVTy->getMinNumElements()) 790 return None; 791 792 // The intrinsic is extracting a fixed lane so use an extract instead. 793 auto *IdxTy = Type::getInt64Ty(II.getContext()); 794 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx)); 795 Extract->insertBefore(&II); 796 Extract->takeName(&II); 797 return IC.replaceInstUsesWith(II, Extract); 798 } 799 800 static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC, 801 IntrinsicInst &II) { 802 LLVMContext &Ctx = II.getContext(); 803 IRBuilder<> Builder(Ctx); 804 Builder.SetInsertPoint(&II); 805 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr 806 // can work with RDFFR_PP for ptest elimination. 
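  // In effect (illustrative): rdffr() becomes rdffr.z(ptrue(all)). An
  // all-active governing predicate masks out no FFR lanes, so the two forms
  // are equivalent.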
807 auto *AllPat = 808 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 809 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 810 {II.getType()}, {AllPat}); 811 auto *RDFFR = 812 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); 813 RDFFR->takeName(&II); 814 return IC.replaceInstUsesWith(II, RDFFR); 815 } 816 817 static Optional<Instruction *> 818 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { 819 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); 820 821 if (Pattern == AArch64SVEPredPattern::all) { 822 LLVMContext &Ctx = II.getContext(); 823 IRBuilder<> Builder(Ctx); 824 Builder.SetInsertPoint(&II); 825 826 Constant *StepVal = ConstantInt::get(II.getType(), NumElts); 827 auto *VScale = Builder.CreateVScale(StepVal); 828 VScale->takeName(&II); 829 return IC.replaceInstUsesWith(II, VScale); 830 } 831 832 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); 833 834 return MinNumElts && NumElts >= MinNumElts 835 ? Optional<Instruction *>(IC.replaceInstUsesWith( 836 II, ConstantInt::get(II.getType(), MinNumElts))) 837 : None; 838 } 839 840 static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, 841 IntrinsicInst &II) { 842 IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 843 IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 844 845 if (Op1 && Op2 && 846 Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 847 Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 848 Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) { 849 850 IRBuilder<> Builder(II.getContext()); 851 Builder.SetInsertPoint(&II); 852 853 Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)}; 854 Type *Tys[] = {Op1->getArgOperand(0)->getType()}; 855 856 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); 857 858 PTest->takeName(&II); 859 return IC.replaceInstUsesWith(II, PTest); 860 } 861 862 return None; 863 } 864 865 static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC, 866 IntrinsicInst &II) { 867 // fold (fadd p a (fmul p b c)) -> (fma p a b c) 868 Value *P = II.getOperand(0); 869 Value *A = II.getOperand(1); 870 auto FMul = II.getOperand(2); 871 Value *B, *C; 872 if (!match(FMul, m_Intrinsic<Intrinsic::aarch64_sve_fmul>( 873 m_Specific(P), m_Value(B), m_Value(C)))) 874 return None; 875 876 if (!FMul->hasOneUse()) 877 return None; 878 879 llvm::FastMathFlags FAddFlags = II.getFastMathFlags(); 880 // Stop the combine when the flags on the inputs differ in case dropping flags 881 // would lead to us missing out on more beneficial optimizations. 882 if (FAddFlags != cast<CallInst>(FMul)->getFastMathFlags()) 883 return None; 884 if (!FAddFlags.allowContract()) 885 return None; 886 887 IRBuilder<> Builder(II.getContext()); 888 Builder.SetInsertPoint(&II); 889 auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla, 890 {II.getType()}, {P, A, B, C}, &II); 891 FMLA->setFastMathFlags(FAddFlags); 892 return IC.replaceInstUsesWith(II, FMLA); 893 } 894 895 static bool isAllActivePredicate(Value *Pred) { 896 // Look through convert.from.svbool(convert.to.svbool(...) chain. 
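  // For instance (illustrative), a <vscale x 4 x i1> ptrue(all) that is
  // widened to svbool and then narrowed back to the same number of lanes or
  // fewer is still recognized as all active.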
897 Value *UncastedPred; 898 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( 899 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( 900 m_Value(UncastedPred))))) 901 // If the predicate has the same or less lanes than the uncasted 902 // predicate then we know the casting has no effect. 903 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <= 904 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements()) 905 Pred = UncastedPred; 906 907 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 908 m_ConstantInt<AArch64SVEPredPattern::all>())); 909 } 910 911 static Optional<Instruction *> 912 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 913 IRBuilder<> Builder(II.getContext()); 914 Builder.SetInsertPoint(&II); 915 916 Value *Pred = II.getOperand(0); 917 Value *PtrOp = II.getOperand(1); 918 Type *VecTy = II.getType(); 919 Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo()); 920 921 if (isAllActivePredicate(Pred)) { 922 LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr); 923 Load->copyMetadata(II); 924 return IC.replaceInstUsesWith(II, Load); 925 } 926 927 CallInst *MaskedLoad = 928 Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL), 929 Pred, ConstantAggregateZero::get(VecTy)); 930 MaskedLoad->copyMetadata(II); 931 return IC.replaceInstUsesWith(II, MaskedLoad); 932 } 933 934 static Optional<Instruction *> 935 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 936 IRBuilder<> Builder(II.getContext()); 937 Builder.SetInsertPoint(&II); 938 939 Value *VecOp = II.getOperand(0); 940 Value *Pred = II.getOperand(1); 941 Value *PtrOp = II.getOperand(2); 942 Value *VecPtr = 943 Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo()); 944 945 if (isAllActivePredicate(Pred)) { 946 StoreInst *Store = Builder.CreateStore(VecOp, VecPtr); 947 Store->copyMetadata(II); 948 return IC.eraseInstFromFunction(II); 949 } 950 951 CallInst *MaskedStore = Builder.CreateMaskedStore( 952 VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred); 953 MaskedStore->copyMetadata(II); 954 return IC.eraseInstFromFunction(II); 955 } 956 957 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { 958 switch (Intrinsic) { 959 case Intrinsic::aarch64_sve_fmul: 960 return Instruction::BinaryOps::FMul; 961 case Intrinsic::aarch64_sve_fadd: 962 return Instruction::BinaryOps::FAdd; 963 case Intrinsic::aarch64_sve_fsub: 964 return Instruction::BinaryOps::FSub; 965 default: 966 return Instruction::BinaryOpsEnd; 967 } 968 } 969 970 static Optional<Instruction *> instCombineSVEVectorBinOp(InstCombiner &IC, 971 IntrinsicInst &II) { 972 auto *OpPredicate = II.getOperand(0); 973 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID()); 974 if (BinOpCode == Instruction::BinaryOpsEnd || 975 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 976 m_ConstantInt<AArch64SVEPredPattern::all>()))) 977 return None; 978 IRBuilder<> Builder(II.getContext()); 979 Builder.SetInsertPoint(&II); 980 Builder.setFastMathFlags(II.getFastMathFlags()); 981 auto BinOp = 982 Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2)); 983 return IC.replaceInstUsesWith(II, BinOp); 984 } 985 986 static Optional<Instruction *> instCombineSVEVectorFAdd(InstCombiner &IC, 987 IntrinsicInst &II) { 988 if (auto FMLA = instCombineSVEVectorFMLA(IC, II)) 989 return FMLA; 990 return instCombineSVEVectorBinOp(IC, II); 991 } 992 993 static Optional<Instruction *> 
instCombineSVEVectorMul(InstCombiner &IC, 994 IntrinsicInst &II) { 995 auto *OpPredicate = II.getOperand(0); 996 auto *OpMultiplicand = II.getOperand(1); 997 auto *OpMultiplier = II.getOperand(2); 998 999 IRBuilder<> Builder(II.getContext()); 1000 Builder.SetInsertPoint(&II); 1001 1002 // Return true if a given instruction is a unit splat value, false otherwise. 1003 auto IsUnitSplat = [](auto *I) { 1004 auto *SplatValue = getSplatValue(I); 1005 if (!SplatValue) 1006 return false; 1007 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 1008 }; 1009 1010 // Return true if a given instruction is an aarch64_sve_dup intrinsic call 1011 // with a unit splat value, false otherwise. 1012 auto IsUnitDup = [](auto *I) { 1013 auto *IntrI = dyn_cast<IntrinsicInst>(I); 1014 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) 1015 return false; 1016 1017 auto *SplatValue = IntrI->getOperand(2); 1018 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 1019 }; 1020 1021 if (IsUnitSplat(OpMultiplier)) { 1022 // [f]mul pg %n, (dupx 1) => %n 1023 OpMultiplicand->takeName(&II); 1024 return IC.replaceInstUsesWith(II, OpMultiplicand); 1025 } else if (IsUnitDup(OpMultiplier)) { 1026 // [f]mul pg %n, (dup pg 1) => %n 1027 auto *DupInst = cast<IntrinsicInst>(OpMultiplier); 1028 auto *DupPg = DupInst->getOperand(1); 1029 // TODO: this is naive. The optimization is still valid if DupPg 1030 // 'encompasses' OpPredicate, not only if they're the same predicate. 1031 if (OpPredicate == DupPg) { 1032 OpMultiplicand->takeName(&II); 1033 return IC.replaceInstUsesWith(II, OpMultiplicand); 1034 } 1035 } 1036 1037 return instCombineSVEVectorBinOp(IC, II); 1038 } 1039 1040 static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, 1041 IntrinsicInst &II) { 1042 IRBuilder<> Builder(II.getContext()); 1043 Builder.SetInsertPoint(&II); 1044 Value *UnpackArg = II.getArgOperand(0); 1045 auto *RetTy = cast<ScalableVectorType>(II.getType()); 1046 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || 1047 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; 1048 1049 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) 1050 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) 1051 if (auto *ScalarArg = getSplatValue(UnpackArg)) { 1052 ScalarArg = 1053 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned); 1054 Value *NewVal = 1055 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg); 1056 NewVal->takeName(&II); 1057 return IC.replaceInstUsesWith(II, NewVal); 1058 } 1059 1060 return None; 1061 } 1062 static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC, 1063 IntrinsicInst &II) { 1064 auto *OpVal = II.getOperand(0); 1065 auto *OpIndices = II.getOperand(1); 1066 VectorType *VTy = cast<VectorType>(II.getType()); 1067 1068 // Check whether OpIndices is a constant splat value < minimal element count 1069 // of result. 1070 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices)); 1071 if (!SplatValue || 1072 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue())) 1073 return None; 1074 1075 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to 1076 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. 
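  // For example (illustrative): tbl(%v, splat(2)) reads lane 2 of %v into
  // every result lane, which is equivalent to splatting
  // extractelement(%v, 2), given the bounds check above.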
1077 IRBuilder<> Builder(II.getContext()); 1078 Builder.SetInsertPoint(&II); 1079 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue); 1080 auto *VectorSplat = 1081 Builder.CreateVectorSplat(VTy->getElementCount(), Extract); 1082 1083 VectorSplat->takeName(&II); 1084 return IC.replaceInstUsesWith(II, VectorSplat); 1085 } 1086 1087 static Optional<Instruction *> instCombineSVETupleGet(InstCombiner &IC, 1088 IntrinsicInst &II) { 1089 // Try to remove sequences of tuple get/set. 1090 Value *SetTuple, *SetIndex, *SetValue; 1091 auto *GetTuple = II.getArgOperand(0); 1092 auto *GetIndex = II.getArgOperand(1); 1093 // Check that we have tuple_get(GetTuple, GetIndex) where GetTuple is a 1094 // call to tuple_set i.e. tuple_set(SetTuple, SetIndex, SetValue). 1095 // Make sure that the types of the current intrinsic and SetValue match 1096 // in order to safely remove the sequence. 1097 if (!match(GetTuple, 1098 m_Intrinsic<Intrinsic::aarch64_sve_tuple_set>( 1099 m_Value(SetTuple), m_Value(SetIndex), m_Value(SetValue))) || 1100 SetValue->getType() != II.getType()) 1101 return None; 1102 // Case where we get the same index right after setting it. 1103 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) --> SetValue 1104 if (GetIndex == SetIndex) 1105 return IC.replaceInstUsesWith(II, SetValue); 1106 // If we are getting a different index than what was set in the tuple_set 1107 // intrinsic. We can just set the input tuple to the one up in the chain. 1108 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) 1109 // --> tuple_get(SetTuple, GetIndex) 1110 return IC.replaceOperand(II, 0, SetTuple); 1111 } 1112 1113 static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC, 1114 IntrinsicInst &II) { 1115 // zip1(uzp1(A, B), uzp2(A, B)) --> A 1116 // zip2(uzp1(A, B), uzp2(A, B)) --> B 1117 Value *A, *B; 1118 if (match(II.getArgOperand(0), 1119 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) && 1120 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( 1121 m_Specific(A), m_Specific(B)))) 1122 return IC.replaceInstUsesWith( 1123 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); 1124 1125 return None; 1126 } 1127 1128 static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC, 1129 IntrinsicInst &II) { 1130 Value *Mask = II.getOperand(0); 1131 Value *BasePtr = II.getOperand(1); 1132 Value *Index = II.getOperand(2); 1133 Type *Ty = II.getType(); 1134 Value *PassThru = ConstantAggregateZero::get(Ty); 1135 1136 // Contiguous gather => masked load. 
1137 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) 1138 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) 1139 Value *IndexBase; 1140 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1141 m_Value(IndexBase), m_SpecificInt(1)))) { 1142 IRBuilder<> Builder(II.getContext()); 1143 Builder.SetInsertPoint(&II); 1144 1145 Align Alignment = 1146 BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); 1147 1148 Type *VecPtrTy = PointerType::getUnqual(Ty); 1149 Value *Ptr = Builder.CreateGEP( 1150 cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase); 1151 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); 1152 CallInst *MaskedLoad = 1153 Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); 1154 MaskedLoad->takeName(&II); 1155 return IC.replaceInstUsesWith(II, MaskedLoad); 1156 } 1157 1158 return None; 1159 } 1160 1161 static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC, 1162 IntrinsicInst &II) { 1163 Value *Val = II.getOperand(0); 1164 Value *Mask = II.getOperand(1); 1165 Value *BasePtr = II.getOperand(2); 1166 Value *Index = II.getOperand(3); 1167 Type *Ty = Val->getType(); 1168 1169 // Contiguous scatter => masked store. 1170 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) 1171 // => (masked.store Value (gep BasePtr IndexBase) Align Mask) 1172 Value *IndexBase; 1173 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1174 m_Value(IndexBase), m_SpecificInt(1)))) { 1175 IRBuilder<> Builder(II.getContext()); 1176 Builder.SetInsertPoint(&II); 1177 1178 Align Alignment = 1179 BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); 1180 1181 Value *Ptr = Builder.CreateGEP( 1182 cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase); 1183 Type *VecPtrTy = PointerType::getUnqual(Ty); 1184 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); 1185 1186 (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); 1187 1188 return IC.eraseInstFromFunction(II); 1189 } 1190 1191 return None; 1192 } 1193 1194 static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, 1195 IntrinsicInst &II) { 1196 IRBuilder<> Builder(II.getContext()); 1197 Builder.SetInsertPoint(&II); 1198 Type *Int32Ty = Builder.getInt32Ty(); 1199 Value *Pred = II.getOperand(0); 1200 Value *Vec = II.getOperand(1); 1201 Value *DivVec = II.getOperand(2); 1202 1203 Value *SplatValue = getSplatValue(DivVec); 1204 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue); 1205 if (!SplatConstantInt) 1206 return None; 1207 APInt Divisor = SplatConstantInt->getValue(); 1208 1209 if (Divisor.isPowerOf2()) { 1210 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1211 auto ASRD = Builder.CreateIntrinsic( 1212 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1213 return IC.replaceInstUsesWith(II, ASRD); 1214 } 1215 if (Divisor.isNegatedPowerOf2()) { 1216 Divisor.negate(); 1217 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1218 auto ASRD = Builder.CreateIntrinsic( 1219 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1220 auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg, 1221 {ASRD->getType()}, {ASRD, Pred, ASRD}); 1222 return IC.replaceInstUsesWith(II, NEG); 1223 } 1224 1225 return None; 1226 } 1227 1228 static Optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, 1229 IntrinsicInst &II) { 1230 Value *A = II.getArgOperand(0); 1231 Value *B = II.getArgOperand(1); 1232 if (A == B) 1233 return 
IC.replaceInstUsesWith(II, A); 1234 1235 return None; 1236 } 1237 1238 static Optional<Instruction *> instCombineSVESrshl(InstCombiner &IC, 1239 IntrinsicInst &II) { 1240 IRBuilder<> Builder(&II); 1241 Value *Pred = II.getOperand(0); 1242 Value *Vec = II.getOperand(1); 1243 Value *Shift = II.getOperand(2); 1244 1245 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. 1246 Value *AbsPred, *MergedValue; 1247 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>( 1248 m_Value(MergedValue), m_Value(AbsPred), m_Value())) && 1249 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>( 1250 m_Value(MergedValue), m_Value(AbsPred), m_Value()))) 1251 1252 return None; 1253 1254 // Transform is valid if any of the following are true: 1255 // * The ABS merge value is an undef or non-negative 1256 // * The ABS predicate is all active 1257 // * The ABS predicate and the SRSHL predicates are the same 1258 if (!isa<UndefValue>(MergedValue) && 1259 !match(MergedValue, m_NonNegative()) && 1260 AbsPred != Pred && !isAllActivePredicate(AbsPred)) 1261 return None; 1262 1263 // Only valid when the shift amount is non-negative, otherwise the rounding 1264 // behaviour of SRSHL cannot be ignored. 1265 if (!match(Shift, m_NonNegative())) 1266 return None; 1267 1268 auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()}, 1269 {Pred, Vec, Shift}); 1270 1271 return IC.replaceInstUsesWith(II, LSL); 1272 } 1273 1274 Optional<Instruction *> 1275 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, 1276 IntrinsicInst &II) const { 1277 Intrinsic::ID IID = II.getIntrinsicID(); 1278 switch (IID) { 1279 default: 1280 break; 1281 case Intrinsic::aarch64_neon_fmaxnm: 1282 case Intrinsic::aarch64_neon_fminnm: 1283 return instCombineMaxMinNM(IC, II); 1284 case Intrinsic::aarch64_sve_convert_from_svbool: 1285 return instCombineConvertFromSVBool(IC, II); 1286 case Intrinsic::aarch64_sve_dup: 1287 return instCombineSVEDup(IC, II); 1288 case Intrinsic::aarch64_sve_dup_x: 1289 return instCombineSVEDupX(IC, II); 1290 case Intrinsic::aarch64_sve_cmpne: 1291 case Intrinsic::aarch64_sve_cmpne_wide: 1292 return instCombineSVECmpNE(IC, II); 1293 case Intrinsic::aarch64_sve_rdffr: 1294 return instCombineRDFFR(IC, II); 1295 case Intrinsic::aarch64_sve_lasta: 1296 case Intrinsic::aarch64_sve_lastb: 1297 return instCombineSVELast(IC, II); 1298 case Intrinsic::aarch64_sve_cntd: 1299 return instCombineSVECntElts(IC, II, 2); 1300 case Intrinsic::aarch64_sve_cntw: 1301 return instCombineSVECntElts(IC, II, 4); 1302 case Intrinsic::aarch64_sve_cnth: 1303 return instCombineSVECntElts(IC, II, 8); 1304 case Intrinsic::aarch64_sve_cntb: 1305 return instCombineSVECntElts(IC, II, 16); 1306 case Intrinsic::aarch64_sve_ptest_any: 1307 case Intrinsic::aarch64_sve_ptest_first: 1308 case Intrinsic::aarch64_sve_ptest_last: 1309 return instCombineSVEPTest(IC, II); 1310 case Intrinsic::aarch64_sve_mul: 1311 case Intrinsic::aarch64_sve_fmul: 1312 return instCombineSVEVectorMul(IC, II); 1313 case Intrinsic::aarch64_sve_fadd: 1314 return instCombineSVEVectorFAdd(IC, II); 1315 case Intrinsic::aarch64_sve_fsub: 1316 return instCombineSVEVectorBinOp(IC, II); 1317 case Intrinsic::aarch64_sve_tbl: 1318 return instCombineSVETBL(IC, II); 1319 case Intrinsic::aarch64_sve_uunpkhi: 1320 case Intrinsic::aarch64_sve_uunpklo: 1321 case Intrinsic::aarch64_sve_sunpkhi: 1322 case Intrinsic::aarch64_sve_sunpklo: 1323 return instCombineSVEUnpack(IC, II); 1324 case Intrinsic::aarch64_sve_tuple_get: 1325 return 
instCombineSVETupleGet(IC, II); 1326 case Intrinsic::aarch64_sve_zip1: 1327 case Intrinsic::aarch64_sve_zip2: 1328 return instCombineSVEZip(IC, II); 1329 case Intrinsic::aarch64_sve_ld1_gather_index: 1330 return instCombineLD1GatherIndex(IC, II); 1331 case Intrinsic::aarch64_sve_st1_scatter_index: 1332 return instCombineST1ScatterIndex(IC, II); 1333 case Intrinsic::aarch64_sve_ld1: 1334 return instCombineSVELD1(IC, II, DL); 1335 case Intrinsic::aarch64_sve_st1: 1336 return instCombineSVEST1(IC, II, DL); 1337 case Intrinsic::aarch64_sve_sdiv: 1338 return instCombineSVESDIV(IC, II); 1339 case Intrinsic::aarch64_sve_sel: 1340 return instCombineSVESel(IC, II); 1341 case Intrinsic::aarch64_sve_srshl: 1342 return instCombineSVESrshl(IC, II); 1343 } 1344 1345 return None; 1346 } 1347 1348 Optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( 1349 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, 1350 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, 1351 std::function<void(Instruction *, unsigned, APInt, APInt &)> 1352 SimplifyAndSetOp) const { 1353 switch (II.getIntrinsicID()) { 1354 default: 1355 break; 1356 case Intrinsic::aarch64_neon_fcvtxn: 1357 case Intrinsic::aarch64_neon_rshrn: 1358 case Intrinsic::aarch64_neon_sqrshrn: 1359 case Intrinsic::aarch64_neon_sqrshrun: 1360 case Intrinsic::aarch64_neon_sqshrn: 1361 case Intrinsic::aarch64_neon_sqshrun: 1362 case Intrinsic::aarch64_neon_sqxtn: 1363 case Intrinsic::aarch64_neon_sqxtun: 1364 case Intrinsic::aarch64_neon_uqrshrn: 1365 case Intrinsic::aarch64_neon_uqshrn: 1366 case Intrinsic::aarch64_neon_uqxtn: 1367 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); 1368 break; 1369 } 1370 1371 return None; 1372 } 1373 1374 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, 1375 ArrayRef<const Value *> Args) { 1376 1377 // A helper that returns a vector type from the given type. The number of 1378 // elements in type Ty determines the vector width. 1379 auto toVectorTy = [&](Type *ArgTy) { 1380 return VectorType::get(ArgTy->getScalarType(), 1381 cast<VectorType>(DstTy)->getElementCount()); 1382 }; 1383 1384 // Exit early if DstTy is not a vector type whose elements are at least 1385 // 16-bits wide. 1386 if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16) 1387 return false; 1388 1389 // Determine if the operation has a widening variant. We consider both the 1390 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the 1391 // instructions. 1392 // 1393 // TODO: Add additional widening operations (e.g., shl, etc.) once we 1394 // verify that their extending operands are eliminated during code 1395 // generation. 1396 switch (Opcode) { 1397 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). 1398 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). 1399 case Instruction::Mul: // SMULL(2), UMULL(2) 1400 break; 1401 default: 1402 return false; 1403 } 1404 1405 // To be a widening instruction (either the "wide" or "long" versions), the 1406 // second operand must be a sign- or zero extend. 1407 if (Args.size() != 2 || 1408 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1]))) 1409 return false; 1410 auto *Extend = cast<CastInst>(Args[1]); 1411 auto *Arg0 = dyn_cast<CastInst>(Args[0]); 1412 1413 // A mul only has a mull version (not like addw). Both operands need to be 1414 // extending and the same type. 
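  // e.g. (illustrative) mul(sext(<4 x i16> %a), sext(<4 x i16> %b)) can use
  // smull, but a sext/zext mix or mismatched source element widths cannot.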
1415 if (Opcode == Instruction::Mul && 1416 (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() || 1417 Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType())) 1418 return false; 1419 1420 // Legalize the destination type and ensure it can be used in a widening 1421 // operation. 1422 auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); 1423 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); 1424 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) 1425 return false; 1426 1427 // Legalize the source type and ensure it can be used in a widening 1428 // operation. 1429 auto *SrcTy = toVectorTy(Extend->getSrcTy()); 1430 auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); 1431 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); 1432 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) 1433 return false; 1434 1435 // Get the total number of vector elements in the legalized types. 1436 InstructionCost NumDstEls = 1437 DstTyL.first * DstTyL.second.getVectorMinNumElements(); 1438 InstructionCost NumSrcEls = 1439 SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); 1440 1441 // Return true if the legalized types have the same number of vector elements 1442 // and the destination element type size is twice that of the source type. 1443 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; 1444 } 1445 1446 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 1447 Type *Src, 1448 TTI::CastContextHint CCH, 1449 TTI::TargetCostKind CostKind, 1450 const Instruction *I) { 1451 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1452 assert(ISD && "Invalid opcode"); 1453 1454 // If the cast is observable, and it is used by a widening instruction (e.g., 1455 // uaddl, saddw, etc.), it may be free. 1456 if (I && I->hasOneUser()) { 1457 auto *SingleUser = cast<Instruction>(*I->user_begin()); 1458 SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); 1459 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { 1460 // If the cast is the second operand, it is free. We will generate either 1461 // a "wide" or "long" version of the widening instruction. 1462 if (I == SingleUser->getOperand(1)) 1463 return 0; 1464 // If the cast is not the second operand, it will be free if it looks the 1465 // same as the second operand. In this case, we will generate a "long" 1466 // version of the widening instruction. 1467 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) 1468 if (I->getOpcode() == unsigned(Cast->getOpcode()) && 1469 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) 1470 return 0; 1471 } 1472 } 1473 1474 // TODO: Allow non-throughput costs that aren't binary. 1475 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { 1476 if (CostKind != TTI::TCK_RecipThroughput) 1477 return Cost == 0 ? 
0 : 1; 1478 return Cost; 1479 }; 1480 1481 EVT SrcTy = TLI->getValueType(DL, Src); 1482 EVT DstTy = TLI->getValueType(DL, Dst); 1483 1484 if (!SrcTy.isSimple() || !DstTy.isSimple()) 1485 return AdjustCost( 1486 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1487 1488 static const TypeConversionCostTblEntry 1489 ConversionTbl[] = { 1490 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, 1491 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, 1492 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, 1493 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, 1494 1495 // Truncations on nxvmiN 1496 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, 1497 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, 1498 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, 1499 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, 1500 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, 1501 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, 1502 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, 1503 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, 1504 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, 1505 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, 1506 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, 1507 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, 1508 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, 1509 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, 1510 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, 1511 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, 1512 1513 // The number of shll instructions for the extension. 1514 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1515 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1516 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1517 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1518 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1519 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1520 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1521 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1522 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1523 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1524 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1525 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1526 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1527 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1528 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1529 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1530 1531 // LowerVectorINT_TO_FP: 1532 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1533 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1534 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1535 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1536 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1537 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1538 1539 // Complex: to v2f32 1540 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1541 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1542 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1543 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1544 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1545 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1546 1547 // Complex: to v4f32 1548 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, 1549 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1550 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 1551 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1552 1553 // Complex: to v8f32 1554 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1555 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 1556 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1557 { ISD::UINT_TO_FP, MVT::v8f32, 
MVT::v8i16, 4 }, 1558 1559 // Complex: to v16f32 1560 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1561 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1562 1563 // Complex: to v2f64 1564 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1565 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1566 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1567 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1568 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1569 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1570 1571 1572 // LowerVectorFP_TO_INT 1573 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, 1574 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 1575 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 1576 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, 1577 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 1578 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 1579 1580 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). 1581 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, 1582 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, 1583 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, 1584 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, 1585 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, 1586 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, 1587 1588 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 1589 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, 1590 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, 1591 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, 1592 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, 1593 1594 // Complex, from nxv2f32. 1595 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1596 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1597 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1598 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1599 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1600 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1601 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1602 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1603 1604 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. 1605 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, 1606 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, 1607 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, 1608 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, 1609 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, 1610 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, 1611 1612 // Complex, from nxv2f64. 1613 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1614 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1615 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1616 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1617 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1618 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1619 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1620 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1621 1622 // Complex, from nxv4f32. 1623 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1624 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1625 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1626 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1627 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1628 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1629 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1630 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1631 1632 // Complex, from nxv8f64. Illegal -> illegal conversions not required. 
1633 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1634 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1635 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1636 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1637 1638 // Complex, from nxv4f64. Illegal -> illegal conversions not required. 1639 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1640 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1641 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1642 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1643 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1644 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1645 1646 // Complex, from nxv8f32. Illegal -> illegal conversions not required. 1647 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 1648 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 1649 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 1650 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 1651 1652 // Complex, from nxv8f16. 1653 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 1654 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 1655 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 1656 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 1657 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 1658 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 1659 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 1660 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 1661 1662 // Complex, from nxv4f16. 1663 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 1664 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 1665 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 1666 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 1667 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 1668 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 1669 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 1670 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 1671 1672 // Complex, from nxv2f16. 1673 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 1674 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 1675 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 1676 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 1677 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 1678 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 1679 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 1680 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 1681 1682 // Truncate from nxvmf32 to nxvmf16. 1683 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, 1684 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, 1685 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, 1686 1687 // Truncate from nxvmf64 to nxvmf16. 1688 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 }, 1689 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 }, 1690 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 }, 1691 1692 // Truncate from nxvmf64 to nxvmf32. 1693 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 }, 1694 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 }, 1695 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 }, 1696 1697 // Extend from nxvmf16 to nxvmf32. 1698 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, 1699 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, 1700 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, 1701 1702 // Extend from nxvmf16 to nxvmf64. 1703 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, 1704 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, 1705 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, 1706 1707 // Extend from nxvmf32 to nxvmf64. 
1708 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, 1709 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, 1710 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, 1711 1712 // Bitcasts from float to integer 1713 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 }, 1714 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 }, 1715 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 }, 1716 1717 // Bitcasts from integer to float 1718 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 }, 1719 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 }, 1720 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 }, 1721 }; 1722 1723 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, 1724 DstTy.getSimpleVT(), 1725 SrcTy.getSimpleVT())) 1726 return AdjustCost(Entry->Cost); 1727 1728 static const TypeConversionCostTblEntry FP16Tbl[] = { 1729 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs 1730 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, 1731 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs 1732 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, 1733 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs 1734 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, 1735 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn 1736 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, 1737 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs 1738 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, 1739 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs 1740 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, 1741 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn 1742 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, 1743 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs 1744 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, 1745 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs 1746 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, 1747 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf 1748 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf 1749 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf 1750 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf 1751 }; 1752 1753 if (ST->hasFullFP16()) 1754 if (const auto *Entry = ConvertCostTableLookup( 1755 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 1756 return AdjustCost(Entry->Cost); 1757 1758 return AdjustCost( 1759 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1760 } 1761 1762 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, 1763 Type *Dst, 1764 VectorType *VecTy, 1765 unsigned Index) { 1766 1767 // Make sure we were given a valid extend opcode. 1768 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && 1769 "Invalid opcode"); 1770 1771 // We are extending an element we extract from a vector, so the source type 1772 // of the extend is the element type of the vector. 1773 auto *Src = VecTy->getElementType(); 1774 1775 // Sign- and zero-extends are for integer types only. 1776 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); 1777 1778 // Get the cost for the extract. We compute the cost (if any) for the extend 1779 // below. 1780 InstructionCost Cost = 1781 getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); 1782 1783 // Legalize the types. 
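  // Illustrative example (a sketch, not taken from a specific test): the
  // pattern being costed here is
  //   %elt = extractelement <4 x i32> %vec, i32 1
  //   %ext = sext i32 %elt to i64
  // where, once the types below are legal, the extension can often fold into
  // a single smov/umov, so only the extract itself ends up being charged.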
1784 auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); 1785 auto DstVT = TLI->getValueType(DL, Dst); 1786 auto SrcVT = TLI->getValueType(DL, Src); 1787 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1788 1789 // If the resulting type is still a vector and the destination type is legal, 1790 // we may get the extension for free. If not, get the default cost for the 1791 // extend. 1792 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) 1793 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1794 CostKind); 1795 1796 // The destination type should be larger than the element type. If not, get 1797 // the default cost for the extend. 1798 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) 1799 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1800 CostKind); 1801 1802 switch (Opcode) { 1803 default: 1804 llvm_unreachable("Opcode should be either SExt or ZExt"); 1805 1806 // For sign-extends, we only need a smov, which performs the extension 1807 // automatically. 1808 case Instruction::SExt: 1809 return Cost; 1810 1811 // For zero-extends, the extend is performed automatically by a umov unless 1812 // the destination type is i64 and the element type is i8 or i16. 1813 case Instruction::ZExt: 1814 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) 1815 return Cost; 1816 } 1817 1818 // If we are unable to perform the extend for free, get the default cost. 1819 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1820 CostKind); 1821 } 1822 1823 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, 1824 TTI::TargetCostKind CostKind, 1825 const Instruction *I) { 1826 if (CostKind != TTI::TCK_RecipThroughput) 1827 return Opcode == Instruction::PHI ? 0 : 1; 1828 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); 1829 // Branches are assumed to be predicted. 1830 return 0; 1831 } 1832 1833 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 1834 unsigned Index) { 1835 assert(Val->isVectorTy() && "This must be a vector type"); 1836 1837 if (Index != -1U) { 1838 // Legalize the type. 1839 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); 1840 1841 // This type is legalized to a scalar type. 1842 if (!LT.second.isVector()) 1843 return 0; 1844 1845 // The type may be split. For fixed-width vectors we can normalize the 1846 // index to the new type. 1847 if (LT.second.isFixedLengthVector()) { 1848 unsigned Width = LT.second.getVectorNumElements(); 1849 Index = Index % Width; 1850 } 1851 1852 // The element at index zero is already inside the vector. 1853 if (Index == 0) 1854 return 0; 1855 } 1856 1857 // All other insert/extracts cost this much. 1858 return ST->getVectorInsertExtractBaseCost(); 1859 } 1860 1861 InstructionCost AArch64TTIImpl::getArithmeticInstrCost( 1862 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 1863 TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, 1864 TTI::OperandValueProperties Opd1PropInfo, 1865 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, 1866 const Instruction *CxtI) { 1867 // TODO: Handle more cost kinds. 1868 if (CostKind != TTI::TCK_RecipThroughput) 1869 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 1870 Opd2Info, Opd1PropInfo, 1871 Opd2PropInfo, Args, CxtI); 1872 1873 // Legalize the type. 
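  // Note: LT.first is the number of legal-type pieces the operation is
  // expected to split into and LT.second is the legal MVT. For example, an
  // add on <4 x i64> is assumed to legalize to two v2i64 halves, giving
  // LT.first == 2 and LT.second == v2i64.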
1874   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1875   int ISD = TLI->InstructionOpcodeToISD(Opcode);
1876 
1877   switch (ISD) {
1878   default:
1879     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1880                                          Opd2Info, Opd1PropInfo, Opd2PropInfo);
1881   case ISD::SDIV:
1882     if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
1883         Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
1884       // On AArch64, scalar signed division by a power-of-two constant is
1885       // normally expanded to the sequence ADD + CMP + SELECT + SRA.
1886       // The OperandValue properties may not be the same as those of the
1887       // previous operation; conservatively assume OP_None.
1888       InstructionCost Cost = getArithmeticInstrCost(
1889           Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
1890           TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1891       Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Opd1Info,
1892                                      Opd2Info, TargetTransformInfo::OP_None,
1893                                      TargetTransformInfo::OP_None);
1894       Cost += getArithmeticInstrCost(
1895           Instruction::Select, Ty, CostKind, Opd1Info, Opd2Info,
1896           TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1897       Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Opd1Info,
1898                                      Opd2Info, TargetTransformInfo::OP_None,
1899                                      TargetTransformInfo::OP_None);
1900       return Cost;
1901     }
1902     LLVM_FALLTHROUGH;
1903   case ISD::UDIV: {
1904     if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
1905       auto VT = TLI->getValueType(DL, Ty);
1906       if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
1907         // Vector signed division by constant is expanded to the
1908         // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
1909         // to MULHS + SUB + SRL + ADD + SRL.
1910         InstructionCost MulCost = getArithmeticInstrCost(
1911             Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
1912             TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1913         InstructionCost AddCost = getArithmeticInstrCost(
1914             Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
1915             TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1916         InstructionCost ShrCost = getArithmeticInstrCost(
1917             Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
1918             TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1919         return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
1920       }
1921     }
1922 
1923     InstructionCost Cost = BaseT::getArithmeticInstrCost(
1924         Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
1925     if (Ty->isVectorTy()) {
1926       // On AArch64, vector divisions are not supported natively and are
1927       // expanded into scalar divisions of each pair of elements.
1928       Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
1929                                      Opd1Info, Opd2Info, Opd1PropInfo,
1930                                      Opd2PropInfo);
1931       Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
1932                                      Opd1Info, Opd2Info, Opd1PropInfo,
1933                                      Opd2PropInfo);
1934       // TODO: if one of the arguments is scalar, then it's not necessary to
1935       // double the cost of handling the vector elements.
1936       Cost += Cost;
1937     }
1938     return Cost;
1939   }
1940   case ISD::MUL:
1941     // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
1942     // as elements are extracted from the vectors and the muls scalarized.
1943 // As getScalarizationOverhead is a bit too pessimistic, we estimate the 1944 // cost for a i64 vector directly here, which is: 1945 // - four 2-cost i64 extracts, 1946 // - two 2-cost i64 inserts, and 1947 // - two 1-cost muls. 1948 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with 1949 // LT.first = 2 the cost is 28. If both operands are extensions it will not 1950 // need to scalarize so the cost can be cheaper (smull or umull). 1951 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) 1952 return LT.first; 1953 return LT.first * 14; 1954 case ISD::ADD: 1955 case ISD::XOR: 1956 case ISD::OR: 1957 case ISD::AND: 1958 case ISD::SRL: 1959 case ISD::SRA: 1960 case ISD::SHL: 1961 // These nodes are marked as 'custom' for combining purposes only. 1962 // We know that they are legal. See LowerAdd in ISelLowering. 1963 return LT.first; 1964 1965 case ISD::FADD: 1966 case ISD::FSUB: 1967 case ISD::FMUL: 1968 case ISD::FDIV: 1969 case ISD::FNEG: 1970 // These nodes are marked as 'custom' just to lower them to SVE. 1971 // We know said lowering will incur no additional cost. 1972 if (!Ty->getScalarType()->isFP128Ty()) 1973 return 2 * LT.first; 1974 1975 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 1976 Opd2Info, Opd1PropInfo, Opd2PropInfo); 1977 } 1978 } 1979 1980 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, 1981 ScalarEvolution *SE, 1982 const SCEV *Ptr) { 1983 // Address computations in vectorized code with non-consecutive addresses will 1984 // likely result in more instructions compared to scalar code where the 1985 // computation can more often be merged into the index mode. The resulting 1986 // extra micro-ops can significantly decrease throughput. 1987 unsigned NumVectorInstToHideOverhead = 10; 1988 int MaxMergeDistance = 64; 1989 1990 if (Ty->isVectorTy() && SE && 1991 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) 1992 return NumVectorInstToHideOverhead; 1993 1994 // In many cases the address computation is not merged into the instruction 1995 // addressing mode. 1996 return 1; 1997 } 1998 1999 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 2000 Type *CondTy, 2001 CmpInst::Predicate VecPred, 2002 TTI::TargetCostKind CostKind, 2003 const Instruction *I) { 2004 // TODO: Handle other cost kinds. 2005 if (CostKind != TTI::TCK_RecipThroughput) 2006 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 2007 I); 2008 2009 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2010 // We don't lower some vector selects well that are wider than the register 2011 // width. 2012 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { 2013 // We would need this many instructions to hide the scalarization happening. 2014 const int AmortizationCost = 20; 2015 2016 // If VecPred is not set, check if we can get a predicate from the context 2017 // instruction, if its type matches the requested ValTy. 2018 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { 2019 CmpInst::Predicate CurrentPred; 2020 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), 2021 m_Value()))) 2022 VecPred = CurrentPred; 2023 } 2024 // Check if we have a compare/select chain that can be lowered using 2025 // a (F)CMxx & BFI pair. 
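    // For example (illustrative IR only, not from a specific test):
    //   %c = fcmp ogt <4 x float> %a, %b
    //   %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
    // is expected to lower to something like fcmgt + bif/bsl, so we only
    // charge the legalization count below.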
2026     if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
2027         VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
2028         VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
2029         VecPred == CmpInst::FCMP_UNE) {
2030       static const auto ValidMinMaxTys = {
2031           MVT::v8i8,  MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
2032           MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
2033       static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
2034 
2035       auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
2036       if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
2037           (ST->hasFullFP16() &&
2038            any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
2039         return LT.first;
2040     }
2041 
2042     static const TypeConversionCostTblEntry
2043     VectorSelectTbl[] = {
2044       { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
2045       { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
2046       { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
2047       { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
2048       { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
2049       { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
2050     };
2051 
2052     EVT SelCondTy = TLI->getValueType(DL, CondTy);
2053     EVT SelValTy = TLI->getValueType(DL, ValTy);
2054     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
2055       if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
2056                                                      SelCondTy.getSimpleVT(),
2057                                                      SelValTy.getSimpleVT()))
2058         return Entry->Cost;
2059     }
2060   }
2061   // The base case handles scalable vectors fine for now, since it treats the
2062   // cost as 1 * legalization cost.
2063   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2064 }
2065 
2066 AArch64TTIImpl::TTI::MemCmpExpansionOptions
2067 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
2068   TTI::MemCmpExpansionOptions Options;
2069   if (ST->requiresStrictAlign()) {
2070     // TODO: Add cost modeling for strict align. Misaligned loads expand to
2071     // a bunch of instructions when strict align is enabled.
2072     return Options;
2073   }
2074   Options.AllowOverlappingLoads = true;
2075   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
2076   Options.NumLoadsPerBlock = Options.MaxNumLoads;
2077   // TODO: Though vector loads usually perform well on AArch64, in some targets
2078   // they may wake up the FP unit, which raises the power consumption. Perhaps
2079   // they could be used with no holds barred (-O3).
2080   Options.LoadSizes = {8, 4, 2, 1};
2081   return Options;
2082 }
2083 
2084 InstructionCost
2085 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
2086                                       Align Alignment, unsigned AddressSpace,
2087                                       TTI::TargetCostKind CostKind) {
2088   if (useNeonVector(Src))
2089     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2090                                         CostKind);
2091   auto LT = TLI->getTypeLegalizationCost(DL, Src);
2092   if (!LT.first.isValid())
2093     return InstructionCost::getInvalid();
2094 
2095   // The code-generator is currently not able to handle scalable vectors
2096   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2097   // it. This change will be removed when code-generation for these types is
2098   // sufficiently reliable.
2099 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1)) 2100 return InstructionCost::getInvalid(); 2101 2102 return LT.first * 2; 2103 } 2104 2105 static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { 2106 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead; 2107 } 2108 2109 InstructionCost AArch64TTIImpl::getGatherScatterOpCost( 2110 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 2111 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 2112 if (useNeonVector(DataTy)) 2113 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 2114 Alignment, CostKind, I); 2115 auto *VT = cast<VectorType>(DataTy); 2116 auto LT = TLI->getTypeLegalizationCost(DL, DataTy); 2117 if (!LT.first.isValid()) 2118 return InstructionCost::getInvalid(); 2119 2120 // The code-generator is currently not able to handle scalable vectors 2121 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2122 // it. This change will be removed when code-generation for these types is 2123 // sufficiently reliable. 2124 if (cast<VectorType>(DataTy)->getElementCount() == 2125 ElementCount::getScalable(1)) 2126 return InstructionCost::getInvalid(); 2127 2128 ElementCount LegalVF = LT.second.getVectorElementCount(); 2129 InstructionCost MemOpCost = 2130 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); 2131 // Add on an overhead cost for using gathers/scatters. 2132 // TODO: At the moment this is applied unilaterally for all CPUs, but at some 2133 // point we may want a per-CPU overhead. 2134 MemOpCost *= getSVEGatherScatterOverhead(Opcode); 2135 return LT.first * MemOpCost * getMaxNumElements(LegalVF); 2136 } 2137 2138 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { 2139 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); 2140 } 2141 2142 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, 2143 MaybeAlign Alignment, 2144 unsigned AddressSpace, 2145 TTI::TargetCostKind CostKind, 2146 const Instruction *I) { 2147 EVT VT = TLI->getValueType(DL, Ty, true); 2148 // Type legalization can't handle structs 2149 if (VT == MVT::Other) 2150 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, 2151 CostKind); 2152 2153 auto LT = TLI->getTypeLegalizationCost(DL, Ty); 2154 if (!LT.first.isValid()) 2155 return InstructionCost::getInvalid(); 2156 2157 // The code-generator is currently not able to handle scalable vectors 2158 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2159 // it. This change will be removed when code-generation for these types is 2160 // sufficiently reliable. 2161 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) 2162 if (VTy->getElementCount() == ElementCount::getScalable(1)) 2163 return InstructionCost::getInvalid(); 2164 2165 // TODO: consider latency as well for TCK_SizeAndLatency. 2166 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) 2167 return LT.first; 2168 2169 if (CostKind != TTI::TCK_RecipThroughput) 2170 return 1; 2171 2172 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && 2173 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) { 2174 // Unaligned stores are extremely inefficient. We don't split all 2175 // unaligned 128-bit stores because the negative impact that has shown in 2176 // practice on inlined block copy code. 
2177 // We make such stores expensive so that we will only vectorize if there 2178 // are 6 other instructions getting vectorized. 2179 const int AmortizationCost = 6; 2180 2181 return LT.first * 2 * AmortizationCost; 2182 } 2183 2184 // Check truncating stores and extending loads. 2185 if (useNeonVector(Ty) && 2186 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { 2187 // v4i8 types are lowered to scalar a load/store and sshll/xtn. 2188 if (VT == MVT::v4i8) 2189 return 2; 2190 // Otherwise we need to scalarize. 2191 return cast<FixedVectorType>(Ty)->getNumElements() * 2; 2192 } 2193 2194 return LT.first; 2195 } 2196 2197 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( 2198 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 2199 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 2200 bool UseMaskForCond, bool UseMaskForGaps) { 2201 assert(Factor >= 2 && "Invalid interleave factor"); 2202 auto *VecVTy = cast<FixedVectorType>(VecTy); 2203 2204 if (!UseMaskForCond && !UseMaskForGaps && 2205 Factor <= TLI->getMaxSupportedInterleaveFactor()) { 2206 unsigned NumElts = VecVTy->getNumElements(); 2207 auto *SubVecTy = 2208 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); 2209 2210 // ldN/stN only support legal vector types of size 64 or 128 in bits. 2211 // Accesses having vector types that are a multiple of 128 bits can be 2212 // matched to more than one ldN/stN instruction. 2213 bool UseScalable; 2214 if (NumElts % Factor == 0 && 2215 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) 2216 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); 2217 } 2218 2219 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 2220 Alignment, AddressSpace, CostKind, 2221 UseMaskForCond, UseMaskForGaps); 2222 } 2223 2224 InstructionCost 2225 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { 2226 InstructionCost Cost = 0; 2227 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2228 for (auto *I : Tys) { 2229 if (!I->isVectorTy()) 2230 continue; 2231 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == 2232 128) 2233 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + 2234 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); 2235 } 2236 return Cost; 2237 } 2238 2239 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { 2240 return ST->getMaxInterleaveFactor(); 2241 } 2242 2243 // For Falkor, we want to avoid having too many strided loads in a loop since 2244 // that can exhaust the HW prefetcher resources. We adjust the unroller 2245 // MaxCount preference below to attempt to ensure unrolling doesn't create too 2246 // many strided loads. 2247 static void 2248 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2249 TargetTransformInfo::UnrollingPreferences &UP) { 2250 enum { MaxStridedLoads = 7 }; 2251 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { 2252 int StridedLoads = 0; 2253 // FIXME? We could make this more precise by looking at the CFG and 2254 // e.g. not counting loads in each side of an if-then-else diamond. 
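    // For this heuristic, a load counts as "strided" if its address is a
    // loop-variant affine AddRec, e.g. something like a[base + i * stride]
    // computed inside the loop; loop-invariant addresses are skipped below.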
2255 for (const auto BB : L->blocks()) { 2256 for (auto &I : *BB) { 2257 LoadInst *LMemI = dyn_cast<LoadInst>(&I); 2258 if (!LMemI) 2259 continue; 2260 2261 Value *PtrValue = LMemI->getPointerOperand(); 2262 if (L->isLoopInvariant(PtrValue)) 2263 continue; 2264 2265 const SCEV *LSCEV = SE.getSCEV(PtrValue); 2266 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); 2267 if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) 2268 continue; 2269 2270 // FIXME? We could take pairing of unrolled load copies into account 2271 // by looking at the AddRec, but we would probably have to limit this 2272 // to loops with no stores or other memory optimization barriers. 2273 ++StridedLoads; 2274 // We've seen enough strided loads that seeing more won't make a 2275 // difference. 2276 if (StridedLoads > MaxStridedLoads / 2) 2277 return StridedLoads; 2278 } 2279 } 2280 return StridedLoads; 2281 }; 2282 2283 int StridedLoads = countStridedLoads(L, SE); 2284 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads 2285 << " strided loads\n"); 2286 // Pick the largest power of 2 unroll count that won't result in too many 2287 // strided loads. 2288 if (StridedLoads) { 2289 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); 2290 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " 2291 << UP.MaxCount << '\n'); 2292 } 2293 } 2294 2295 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2296 TTI::UnrollingPreferences &UP, 2297 OptimizationRemarkEmitter *ORE) { 2298 // Enable partial unrolling and runtime unrolling. 2299 BaseT::getUnrollingPreferences(L, SE, UP, ORE); 2300 2301 UP.UpperBound = true; 2302 2303 // For inner loop, it is more likely to be a hot one, and the runtime check 2304 // can be promoted out from LICM pass, so the overhead is less, let's try 2305 // a larger threshold to unroll more loops. 2306 if (L->getLoopDepth() > 1) 2307 UP.PartialThreshold *= 2; 2308 2309 // Disable partial & runtime unrolling on -Os. 2310 UP.PartialOptSizeThreshold = 0; 2311 2312 if (ST->getProcFamily() == AArch64Subtarget::Falkor && 2313 EnableFalkorHWPFUnrollFix) 2314 getFalkorUnrollingPreferences(L, SE, UP); 2315 2316 // Scan the loop: don't unroll loops with calls as this could prevent 2317 // inlining. Don't unroll vector loops either, as they don't benefit much from 2318 // unrolling. 2319 for (auto *BB : L->getBlocks()) { 2320 for (auto &I : *BB) { 2321 // Don't unroll vectorised loop. 
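      // (Any instruction producing a vector value is taken as a sign that the
      // loop has already been vectorised, so give up on unrolling it.)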
2322 if (I.getType()->isVectorTy()) 2323 return; 2324 2325 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 2326 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 2327 if (!isLoweredToCall(F)) 2328 continue; 2329 } 2330 return; 2331 } 2332 } 2333 } 2334 2335 // Enable runtime unrolling for in-order models 2336 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by 2337 // checking for that case, we can ensure that the default behaviour is 2338 // unchanged 2339 if (ST->getProcFamily() != AArch64Subtarget::Others && 2340 !ST->getSchedModel().isOutOfOrder()) { 2341 UP.Runtime = true; 2342 UP.Partial = true; 2343 UP.UnrollRemainder = true; 2344 UP.DefaultUnrollRuntimeCount = 4; 2345 2346 UP.UnrollAndJam = true; 2347 UP.UnrollAndJamInnerLoopThreshold = 60; 2348 } 2349 } 2350 2351 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 2352 TTI::PeelingPreferences &PP) { 2353 BaseT::getPeelingPreferences(L, SE, PP); 2354 } 2355 2356 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, 2357 Type *ExpectedType) { 2358 switch (Inst->getIntrinsicID()) { 2359 default: 2360 return nullptr; 2361 case Intrinsic::aarch64_neon_st2: 2362 case Intrinsic::aarch64_neon_st3: 2363 case Intrinsic::aarch64_neon_st4: { 2364 // Create a struct type 2365 StructType *ST = dyn_cast<StructType>(ExpectedType); 2366 if (!ST) 2367 return nullptr; 2368 unsigned NumElts = Inst->arg_size() - 1; 2369 if (ST->getNumElements() != NumElts) 2370 return nullptr; 2371 for (unsigned i = 0, e = NumElts; i != e; ++i) { 2372 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) 2373 return nullptr; 2374 } 2375 Value *Res = UndefValue::get(ExpectedType); 2376 IRBuilder<> Builder(Inst); 2377 for (unsigned i = 0, e = NumElts; i != e; ++i) { 2378 Value *L = Inst->getArgOperand(i); 2379 Res = Builder.CreateInsertValue(Res, L, i); 2380 } 2381 return Res; 2382 } 2383 case Intrinsic::aarch64_neon_ld2: 2384 case Intrinsic::aarch64_neon_ld3: 2385 case Intrinsic::aarch64_neon_ld4: 2386 if (Inst->getType() == ExpectedType) 2387 return Inst; 2388 return nullptr; 2389 } 2390 } 2391 2392 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, 2393 MemIntrinsicInfo &Info) { 2394 switch (Inst->getIntrinsicID()) { 2395 default: 2396 break; 2397 case Intrinsic::aarch64_neon_ld2: 2398 case Intrinsic::aarch64_neon_ld3: 2399 case Intrinsic::aarch64_neon_ld4: 2400 Info.ReadMem = true; 2401 Info.WriteMem = false; 2402 Info.PtrVal = Inst->getArgOperand(0); 2403 break; 2404 case Intrinsic::aarch64_neon_st2: 2405 case Intrinsic::aarch64_neon_st3: 2406 case Intrinsic::aarch64_neon_st4: 2407 Info.ReadMem = false; 2408 Info.WriteMem = true; 2409 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1); 2410 break; 2411 } 2412 2413 switch (Inst->getIntrinsicID()) { 2414 default: 2415 return false; 2416 case Intrinsic::aarch64_neon_ld2: 2417 case Intrinsic::aarch64_neon_st2: 2418 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; 2419 break; 2420 case Intrinsic::aarch64_neon_ld3: 2421 case Intrinsic::aarch64_neon_st3: 2422 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; 2423 break; 2424 case Intrinsic::aarch64_neon_ld4: 2425 case Intrinsic::aarch64_neon_st4: 2426 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; 2427 break; 2428 } 2429 return true; 2430 } 2431 2432 /// See if \p I should be considered for address type promotion. We check if \p 2433 /// I is a sext with right type and used in memory accesses. 
If it used in a 2434 /// "complex" getelementptr, we allow it to be promoted without finding other 2435 /// sext instructions that sign extended the same initial value. A getelementptr 2436 /// is considered as "complex" if it has more than 2 operands. 2437 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( 2438 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { 2439 bool Considerable = false; 2440 AllowPromotionWithoutCommonHeader = false; 2441 if (!isa<SExtInst>(&I)) 2442 return false; 2443 Type *ConsideredSExtType = 2444 Type::getInt64Ty(I.getParent()->getParent()->getContext()); 2445 if (I.getType() != ConsideredSExtType) 2446 return false; 2447 // See if the sext is the one with the right type and used in at least one 2448 // GetElementPtrInst. 2449 for (const User *U : I.users()) { 2450 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { 2451 Considerable = true; 2452 // A getelementptr is considered as "complex" if it has more than 2 2453 // operands. We will promote a SExt used in such complex GEP as we 2454 // expect some computation to be merged if they are done on 64 bits. 2455 if (GEPInst->getNumOperands() > 2) { 2456 AllowPromotionWithoutCommonHeader = true; 2457 break; 2458 } 2459 } 2460 } 2461 return Considerable; 2462 } 2463 2464 bool AArch64TTIImpl::isLegalToVectorizeReduction( 2465 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { 2466 if (!VF.isScalable()) 2467 return true; 2468 2469 Type *Ty = RdxDesc.getRecurrenceType(); 2470 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) 2471 return false; 2472 2473 switch (RdxDesc.getRecurrenceKind()) { 2474 case RecurKind::Add: 2475 case RecurKind::FAdd: 2476 case RecurKind::And: 2477 case RecurKind::Or: 2478 case RecurKind::Xor: 2479 case RecurKind::SMin: 2480 case RecurKind::SMax: 2481 case RecurKind::UMin: 2482 case RecurKind::UMax: 2483 case RecurKind::FMin: 2484 case RecurKind::FMax: 2485 case RecurKind::SelectICmp: 2486 case RecurKind::SelectFCmp: 2487 case RecurKind::FMulAdd: 2488 return true; 2489 default: 2490 return false; 2491 } 2492 } 2493 2494 InstructionCost 2495 AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, 2496 bool IsUnsigned, 2497 TTI::TargetCostKind CostKind) { 2498 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 2499 2500 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) 2501 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); 2502 2503 assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) && 2504 "Both vector needs to be equally scalable"); 2505 2506 InstructionCost LegalizationCost = 0; 2507 if (LT.first > 1) { 2508 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); 2509 unsigned MinMaxOpcode = 2510 Ty->isFPOrFPVectorTy() 2511 ? Intrinsic::maxnum 2512 : (IsUnsigned ? 
Intrinsic::umin : Intrinsic::smin); 2513 IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy}); 2514 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1); 2515 } 2516 2517 return LegalizationCost + /*Cost of horizontal reduction*/ 2; 2518 } 2519 2520 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( 2521 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { 2522 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 2523 InstructionCost LegalizationCost = 0; 2524 if (LT.first > 1) { 2525 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); 2526 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); 2527 LegalizationCost *= LT.first - 1; 2528 } 2529 2530 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2531 assert(ISD && "Invalid opcode"); 2532 // Add the final reduction cost for the legal horizontal reduction 2533 switch (ISD) { 2534 case ISD::ADD: 2535 case ISD::AND: 2536 case ISD::OR: 2537 case ISD::XOR: 2538 case ISD::FADD: 2539 return LegalizationCost + 2; 2540 default: 2541 return InstructionCost::getInvalid(); 2542 } 2543 } 2544 2545 InstructionCost 2546 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, 2547 Optional<FastMathFlags> FMF, 2548 TTI::TargetCostKind CostKind) { 2549 if (TTI::requiresOrderedReduction(FMF)) { 2550 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) { 2551 InstructionCost BaseCost = 2552 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 2553 // Add on extra cost to reflect the extra overhead on some CPUs. We still 2554 // end up vectorizing for more computationally intensive loops. 2555 return BaseCost + FixedVTy->getNumElements(); 2556 } 2557 2558 if (Opcode != Instruction::FAdd) 2559 return InstructionCost::getInvalid(); 2560 2561 auto *VTy = cast<ScalableVectorType>(ValTy); 2562 InstructionCost Cost = 2563 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); 2564 Cost *= getMaxNumElements(VTy->getElementCount()); 2565 return Cost; 2566 } 2567 2568 if (isa<ScalableVectorType>(ValTy)) 2569 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); 2570 2571 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 2572 MVT MTy = LT.second; 2573 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2574 assert(ISD && "Invalid opcode"); 2575 2576 // Horizontal adds can use the 'addv' instruction. We model the cost of these 2577 // instructions as twice a normal vector add, plus 1 for each legalization 2578 // step (LT.first). This is the only arithmetic vector reduction operation for 2579 // which we have an instruction. 
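  // As a worked example (assuming the usual NEON legalization of v8i32 into
  // two v4i32 halves, so LT.first == 2): an add reduction of v8i32 costs
  // (LT.first - 1) + 2 == 3 using the ADD entries in the table below.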
2580 // OR, XOR and AND costs should match the codegen from: 2581 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll 2582 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll 2583 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll 2584 static const CostTblEntry CostTblNoPairwise[]{ 2585 {ISD::ADD, MVT::v8i8, 2}, 2586 {ISD::ADD, MVT::v16i8, 2}, 2587 {ISD::ADD, MVT::v4i16, 2}, 2588 {ISD::ADD, MVT::v8i16, 2}, 2589 {ISD::ADD, MVT::v4i32, 2}, 2590 {ISD::OR, MVT::v8i8, 15}, 2591 {ISD::OR, MVT::v16i8, 17}, 2592 {ISD::OR, MVT::v4i16, 7}, 2593 {ISD::OR, MVT::v8i16, 9}, 2594 {ISD::OR, MVT::v2i32, 3}, 2595 {ISD::OR, MVT::v4i32, 5}, 2596 {ISD::OR, MVT::v2i64, 3}, 2597 {ISD::XOR, MVT::v8i8, 15}, 2598 {ISD::XOR, MVT::v16i8, 17}, 2599 {ISD::XOR, MVT::v4i16, 7}, 2600 {ISD::XOR, MVT::v8i16, 9}, 2601 {ISD::XOR, MVT::v2i32, 3}, 2602 {ISD::XOR, MVT::v4i32, 5}, 2603 {ISD::XOR, MVT::v2i64, 3}, 2604 {ISD::AND, MVT::v8i8, 15}, 2605 {ISD::AND, MVT::v16i8, 17}, 2606 {ISD::AND, MVT::v4i16, 7}, 2607 {ISD::AND, MVT::v8i16, 9}, 2608 {ISD::AND, MVT::v2i32, 3}, 2609 {ISD::AND, MVT::v4i32, 5}, 2610 {ISD::AND, MVT::v2i64, 3}, 2611 }; 2612 switch (ISD) { 2613 default: 2614 break; 2615 case ISD::ADD: 2616 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) 2617 return (LT.first - 1) + Entry->Cost; 2618 break; 2619 case ISD::XOR: 2620 case ISD::AND: 2621 case ISD::OR: 2622 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy); 2623 if (!Entry) 2624 break; 2625 auto *ValVTy = cast<FixedVectorType>(ValTy); 2626 if (!ValVTy->getElementType()->isIntegerTy(1) && 2627 MTy.getVectorNumElements() <= ValVTy->getNumElements() && 2628 isPowerOf2_32(ValVTy->getNumElements())) { 2629 InstructionCost ExtraCost = 0; 2630 if (LT.first != 1) { 2631 // Type needs to be split, so there is an extra cost of LT.first - 1 2632 // arithmetic ops. 2633 auto *Ty = FixedVectorType::get(ValTy->getElementType(), 2634 MTy.getVectorNumElements()); 2635 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); 2636 ExtraCost *= LT.first - 1; 2637 } 2638 return Entry->Cost + ExtraCost; 2639 } 2640 break; 2641 } 2642 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 2643 } 2644 2645 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { 2646 static const CostTblEntry ShuffleTbl[] = { 2647 { TTI::SK_Splice, MVT::nxv16i8, 1 }, 2648 { TTI::SK_Splice, MVT::nxv8i16, 1 }, 2649 { TTI::SK_Splice, MVT::nxv4i32, 1 }, 2650 { TTI::SK_Splice, MVT::nxv2i64, 1 }, 2651 { TTI::SK_Splice, MVT::nxv2f16, 1 }, 2652 { TTI::SK_Splice, MVT::nxv4f16, 1 }, 2653 { TTI::SK_Splice, MVT::nxv8f16, 1 }, 2654 { TTI::SK_Splice, MVT::nxv2bf16, 1 }, 2655 { TTI::SK_Splice, MVT::nxv4bf16, 1 }, 2656 { TTI::SK_Splice, MVT::nxv8bf16, 1 }, 2657 { TTI::SK_Splice, MVT::nxv2f32, 1 }, 2658 { TTI::SK_Splice, MVT::nxv4f32, 1 }, 2659 { TTI::SK_Splice, MVT::nxv2f64, 1 }, 2660 }; 2661 2662 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 2663 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext()); 2664 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2665 EVT PromotedVT = LT.second.getScalarType() == MVT::i1 2666 ? 
TLI->getPromotedVTForPredicate(EVT(LT.second)) 2667 : LT.second; 2668 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext()); 2669 InstructionCost LegalizationCost = 0; 2670 if (Index < 0) { 2671 LegalizationCost = 2672 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy, 2673 CmpInst::BAD_ICMP_PREDICATE, CostKind) + 2674 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy, 2675 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2676 } 2677 2678 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp 2679 // Cost performed on a promoted type. 2680 if (LT.second.getScalarType() == MVT::i1) { 2681 LegalizationCost += 2682 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy, 2683 TTI::CastContextHint::None, CostKind) + 2684 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy, 2685 TTI::CastContextHint::None, CostKind); 2686 } 2687 const auto *Entry = 2688 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT()); 2689 assert(Entry && "Illegal Type for Splice"); 2690 LegalizationCost += Entry->Cost; 2691 return LegalizationCost * LT.first; 2692 } 2693 2694 InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, 2695 VectorType *Tp, 2696 ArrayRef<int> Mask, int Index, 2697 VectorType *SubTp, 2698 ArrayRef<const Value *> Args) { 2699 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 2700 // If we have a Mask, and the LT is being legalized somehow, split the Mask 2701 // into smaller vectors and sum the cost of each shuffle. 2702 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() && 2703 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && 2704 cast<FixedVectorType>(Tp)->getNumElements() > 2705 LT.second.getVectorNumElements() && 2706 !Index && !SubTp) { 2707 unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements(); 2708 assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!"); 2709 unsigned LTNumElts = LT.second.getVectorNumElements(); 2710 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; 2711 VectorType *NTp = 2712 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount()); 2713 InstructionCost Cost; 2714 for (unsigned N = 0; N < NumVecs; N++) { 2715 SmallVector<int> NMask; 2716 // Split the existing mask into chunks of size LTNumElts. Track the source 2717 // sub-vectors to ensure the result has at most 2 inputs. 2718 unsigned Source1, Source2; 2719 unsigned NumSources = 0; 2720 for (unsigned E = 0; E < LTNumElts; E++) { 2721 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E] 2722 : UndefMaskElem; 2723 if (MaskElt < 0) { 2724 NMask.push_back(UndefMaskElem); 2725 continue; 2726 } 2727 2728 // Calculate which source from the input this comes from and whether it 2729 // is new to us. 2730 unsigned Source = MaskElt / LTNumElts; 2731 if (NumSources == 0) { 2732 Source1 = Source; 2733 NumSources = 1; 2734 } else if (NumSources == 1 && Source != Source1) { 2735 Source2 = Source; 2736 NumSources = 2; 2737 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) { 2738 NumSources++; 2739 } 2740 2741 // Add to the new mask. For the NumSources>2 case these are not correct, 2742 // but are only used for the modular lane number. 
2743 if (Source == Source1) 2744 NMask.push_back(MaskElt % LTNumElts); 2745 else if (Source == Source2) 2746 NMask.push_back(MaskElt % LTNumElts + LTNumElts); 2747 else 2748 NMask.push_back(MaskElt % LTNumElts); 2749 } 2750 // If the sub-mask has at most 2 input sub-vectors then re-cost it using 2751 // getShuffleCost. If not then cost it using the worst case. 2752 if (NumSources <= 2) 2753 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc 2754 : TTI::SK_PermuteTwoSrc, 2755 NTp, NMask, 0, nullptr, Args); 2756 else if (any_of(enumerate(NMask), [&](const auto &ME) { 2757 return ME.value() % LTNumElts == ME.index(); 2758 })) 2759 Cost += LTNumElts - 1; 2760 else 2761 Cost += LTNumElts; 2762 } 2763 return Cost; 2764 } 2765 2766 Kind = improveShuffleKindFromMask(Kind, Mask); 2767 2768 // Check for broadcast loads. 2769 if (Kind == TTI::SK_Broadcast) { 2770 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]); 2771 if (IsLoad && LT.second.isVector() && 2772 isLegalBroadcastLoad(Tp->getElementType(), 2773 LT.second.getVectorElementCount())) 2774 return 0; // broadcast is handled by ld1r 2775 } 2776 2777 // If we have 4 elements for the shuffle and a Mask, get the cost straight 2778 // from the perfect shuffle tables. 2779 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) && 2780 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) && 2781 all_of(Mask, [](int E) { return E < 8; })) 2782 return getPerfectShuffleCost(Mask); 2783 2784 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || 2785 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || 2786 Kind == TTI::SK_Reverse) { 2787 2788 static const CostTblEntry ShuffleTbl[] = { 2789 // Broadcast shuffle kinds can be performed with 'dup'. 2790 { TTI::SK_Broadcast, MVT::v8i8, 1 }, 2791 { TTI::SK_Broadcast, MVT::v16i8, 1 }, 2792 { TTI::SK_Broadcast, MVT::v4i16, 1 }, 2793 { TTI::SK_Broadcast, MVT::v8i16, 1 }, 2794 { TTI::SK_Broadcast, MVT::v2i32, 1 }, 2795 { TTI::SK_Broadcast, MVT::v4i32, 1 }, 2796 { TTI::SK_Broadcast, MVT::v2i64, 1 }, 2797 { TTI::SK_Broadcast, MVT::v2f32, 1 }, 2798 { TTI::SK_Broadcast, MVT::v4f32, 1 }, 2799 { TTI::SK_Broadcast, MVT::v2f64, 1 }, 2800 // Transpose shuffle kinds can be performed with 'trn1/trn2' and 2801 // 'zip1/zip2' instructions. 2802 { TTI::SK_Transpose, MVT::v8i8, 1 }, 2803 { TTI::SK_Transpose, MVT::v16i8, 1 }, 2804 { TTI::SK_Transpose, MVT::v4i16, 1 }, 2805 { TTI::SK_Transpose, MVT::v8i16, 1 }, 2806 { TTI::SK_Transpose, MVT::v2i32, 1 }, 2807 { TTI::SK_Transpose, MVT::v4i32, 1 }, 2808 { TTI::SK_Transpose, MVT::v2i64, 1 }, 2809 { TTI::SK_Transpose, MVT::v2f32, 1 }, 2810 { TTI::SK_Transpose, MVT::v4f32, 1 }, 2811 { TTI::SK_Transpose, MVT::v2f64, 1 }, 2812 // Select shuffle kinds. 2813 // TODO: handle vXi8/vXi16. 2814 { TTI::SK_Select, MVT::v2i32, 1 }, // mov. 2815 { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar). 2816 { TTI::SK_Select, MVT::v2i64, 1 }, // mov. 2817 { TTI::SK_Select, MVT::v2f32, 1 }, // mov. 2818 { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar). 2819 { TTI::SK_Select, MVT::v2f64, 1 }, // mov. 2820 // PermuteSingleSrc shuffle kinds. 2821 { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov. 2822 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case. 2823 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov. 2824 { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov. 2825 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case. 
2826 { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov. 2827 { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case. 2828 { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case. 2829 { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case. 2830 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl 2831 { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl 2832 { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl 2833 { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl 2834 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl 2835 // Reverse can be lowered with `rev`. 2836 { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov. 2837 { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT 2838 { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov. 2839 { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov. 2840 { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT 2841 { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov. 2842 { TTI::SK_Reverse, MVT::v8f16, 2 }, // REV64; EXT 2843 { TTI::SK_Reverse, MVT::v8i16, 2 }, // REV64; EXT 2844 { TTI::SK_Reverse, MVT::v16i8, 2 }, // REV64; EXT 2845 { TTI::SK_Reverse, MVT::v4f16, 1 }, // REV64 2846 { TTI::SK_Reverse, MVT::v4i16, 1 }, // REV64 2847 { TTI::SK_Reverse, MVT::v8i8, 1 }, // REV64 2848 // Broadcast shuffle kinds for scalable vectors 2849 { TTI::SK_Broadcast, MVT::nxv16i8, 1 }, 2850 { TTI::SK_Broadcast, MVT::nxv8i16, 1 }, 2851 { TTI::SK_Broadcast, MVT::nxv4i32, 1 }, 2852 { TTI::SK_Broadcast, MVT::nxv2i64, 1 }, 2853 { TTI::SK_Broadcast, MVT::nxv2f16, 1 }, 2854 { TTI::SK_Broadcast, MVT::nxv4f16, 1 }, 2855 { TTI::SK_Broadcast, MVT::nxv8f16, 1 }, 2856 { TTI::SK_Broadcast, MVT::nxv2bf16, 1 }, 2857 { TTI::SK_Broadcast, MVT::nxv4bf16, 1 }, 2858 { TTI::SK_Broadcast, MVT::nxv8bf16, 1 }, 2859 { TTI::SK_Broadcast, MVT::nxv2f32, 1 }, 2860 { TTI::SK_Broadcast, MVT::nxv4f32, 1 }, 2861 { TTI::SK_Broadcast, MVT::nxv2f64, 1 }, 2862 { TTI::SK_Broadcast, MVT::nxv16i1, 1 }, 2863 { TTI::SK_Broadcast, MVT::nxv8i1, 1 }, 2864 { TTI::SK_Broadcast, MVT::nxv4i1, 1 }, 2865 { TTI::SK_Broadcast, MVT::nxv2i1, 1 }, 2866 // Handle the cases for vector.reverse with scalable vectors 2867 { TTI::SK_Reverse, MVT::nxv16i8, 1 }, 2868 { TTI::SK_Reverse, MVT::nxv8i16, 1 }, 2869 { TTI::SK_Reverse, MVT::nxv4i32, 1 }, 2870 { TTI::SK_Reverse, MVT::nxv2i64, 1 }, 2871 { TTI::SK_Reverse, MVT::nxv2f16, 1 }, 2872 { TTI::SK_Reverse, MVT::nxv4f16, 1 }, 2873 { TTI::SK_Reverse, MVT::nxv8f16, 1 }, 2874 { TTI::SK_Reverse, MVT::nxv2bf16, 1 }, 2875 { TTI::SK_Reverse, MVT::nxv4bf16, 1 }, 2876 { TTI::SK_Reverse, MVT::nxv8bf16, 1 }, 2877 { TTI::SK_Reverse, MVT::nxv2f32, 1 }, 2878 { TTI::SK_Reverse, MVT::nxv4f32, 1 }, 2879 { TTI::SK_Reverse, MVT::nxv2f64, 1 }, 2880 { TTI::SK_Reverse, MVT::nxv16i1, 1 }, 2881 { TTI::SK_Reverse, MVT::nxv8i1, 1 }, 2882 { TTI::SK_Reverse, MVT::nxv4i1, 1 }, 2883 { TTI::SK_Reverse, MVT::nxv2i1, 1 }, 2884 }; 2885 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) 2886 return LT.first * Entry->Cost; 2887 } 2888 2889 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp)) 2890 return getSpliceCost(Tp, Index); 2891 2892 // Inserting a subvector can often be done with either a D, S or H register 2893 // move, so long as the inserted vector is "aligned". 
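  // For example, inserting a v2f32 subvector into a v4f32 at element index 0
  // or 2 keeps the lanes contiguous and is expected to be a single 64-bit
  // (D-register) mov/INS, which is why only "aligned" indices (multiples of
  // the subvector length) take the cheap path below.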
2894 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() && 2895 LT.second.getSizeInBits() <= 128 && SubTp) { 2896 std::pair<InstructionCost, MVT> SubLT = 2897 TLI->getTypeLegalizationCost(DL, SubTp); 2898 if (SubLT.second.isVector()) { 2899 int NumElts = LT.second.getVectorNumElements(); 2900 int NumSubElts = SubLT.second.getVectorNumElements(); 2901 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) 2902 return SubLT.first; 2903 } 2904 } 2905 2906 return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); 2907 } 2908