//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64PerfectShuffle.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <algorithm>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                           cl::Hidden);

static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(10), cl::Hidden);

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
  return (CallerBits & CalleeBits) == CalleeBits;
}

bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
    TargetTransformInfo::RegisterKind K) const {
  assert(K != TargetTransformInfo::RGK_Scalar);
  return K == TargetTransformInfo::RGK_FixedWidthVector;
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}

/// Calculate the cost of materializing the given constant.
InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
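  // For example (illustrative): a 128-bit constant is costed as two 64-bit
  // chunks, each priced independently by the 64-bit helper above.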
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}

InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
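  // Note: the range check below relies on the ordering of the generated
  // Intrinsic::aarch64_* enum values, with the target-specific intrinsics of
  // interest falling between aarch64_addg and aarch64_udiv.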
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                        MVT::v8i16, MVT::v2i32, MVT::v4i32};
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    // v2i64 types get converted to cmp+bif hence the cost of 2
    if (LT.second == MVT::v2i64)
      return LT.first * 2;
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
    break;
  }
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::experimental_stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
    // (LT.first - 1) vector adds.
    if (LT.first > 1) {
      Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
      InstructionCost AddCost =
          getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
      Cost += AddCost * (LT.first - 1);
    }
    return Cost;
  }
  case Intrinsic::bitreverse: {
    static const CostTblEntry BitreverseTbl[] = {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},
    };
    const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy);
    const auto *Entry =
        CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
    if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 are promoted
      // to, plus 1, so that we match the actual lowering cost.
      if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
          TLI->getValueType(DL, RetTy, true) == MVT::i16)
        return LegalisationCost.first * Entry->Cost + 1;

      return LegalisationCost.first * Entry->Cost;
    }
    break;
  }
  case Intrinsic::ctpop: {
    static const CostTblEntry CtpopCostTbl[] = {
        {ISD::CTPOP, MVT::v2i64, 4},
        {ISD::CTPOP, MVT::v4i32, 3},
        {ISD::CTPOP, MVT::v8i16, 2},
        {ISD::CTPOP, MVT::v16i8, 1},
        {ISD::CTPOP, MVT::i64, 4},
        {ISD::CTPOP, MVT::v2i32, 3},
        {ISD::CTPOP, MVT::v4i16, 2},
        {ISD::CTPOP, MVT::v8i8, 1},
        {ISD::CTPOP, MVT::i32, 5},
    };
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    MVT MTy = LT.second;
    if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
      // Extra cost of +1 when illegal vector types are legalized by promoting
      // the integer type.
      int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
                                            RetTy->getScalarSizeInBits()
                          ? 1
                          : 0;
      return LT.first * Entry->Cost + ExtraCost;
    }
    break;
  }
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: {
    static const CostTblEntry WithOverflowCostTbl[] = {
        {Intrinsic::sadd_with_overflow, MVT::i8, 3},
        {Intrinsic::uadd_with_overflow, MVT::i8, 3},
        {Intrinsic::sadd_with_overflow, MVT::i16, 3},
        {Intrinsic::uadd_with_overflow, MVT::i16, 3},
        {Intrinsic::sadd_with_overflow, MVT::i32, 1},
        {Intrinsic::uadd_with_overflow, MVT::i32, 1},
        {Intrinsic::sadd_with_overflow, MVT::i64, 1},
        {Intrinsic::uadd_with_overflow, MVT::i64, 1},
        {Intrinsic::ssub_with_overflow, MVT::i8, 3},
        {Intrinsic::usub_with_overflow, MVT::i8, 3},
        {Intrinsic::ssub_with_overflow, MVT::i16, 3},
        {Intrinsic::usub_with_overflow, MVT::i16, 3},
        {Intrinsic::ssub_with_overflow, MVT::i32, 1},
        {Intrinsic::usub_with_overflow, MVT::i32, 1},
        {Intrinsic::ssub_with_overflow, MVT::i64, 1},
        {Intrinsic::usub_with_overflow, MVT::i64, 1},
        {Intrinsic::smul_with_overflow, MVT::i8, 5},
        {Intrinsic::umul_with_overflow, MVT::i8, 4},
        {Intrinsic::smul_with_overflow, MVT::i16, 5},
        {Intrinsic::umul_with_overflow, MVT::i16, 4},
        {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
        {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
        {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
        {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
    };
    EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
    if (MTy.isSimple())
      if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
                                              MTy.getSimpleVT()))
        return Entry->Cost;
    break;
  }
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat: {
    if (ICA.getArgTypes().empty())
      break;
    bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
    auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]);
    EVT MTy = TLI->getValueType(DL, RetTy);
    // Check for the legal types, which are where the size of the input and the
    // output are the same, or we are using cvt f64->i32 or f32->i64.
    if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
         LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
         LT.second == MVT::v2f64) &&
        (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
         (LT.second == MVT::f64 && MTy == MVT::i32) ||
         (LT.second == MVT::f32 && MTy == MVT::i64)))
      return LT.first;
    // Similarly for fp16 sizes
    if (ST->hasFullFP16() &&
        ((LT.second == MVT::f16 && MTy == MVT::i32) ||
         ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
          (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
      return LT.first;

    // Otherwise we use a legal convert followed by a min+max
    if ((LT.second.getScalarType() == MVT::f32 ||
         LT.second.getScalarType() == MVT::f64 ||
         (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
        LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
      Type *LegalTy =
          Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
      if (LT.second.isVector())
        LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
      InstructionCost Cost = 1;
      IntrinsicCostAttributes Attrs1(
          IsSigned ? Intrinsic::smin : Intrinsic::umin, LegalTy,
          {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs1, CostKind);
      IntrinsicCostAttributes Attrs2(
          IsSigned ? Intrinsic::smax : Intrinsic::umax, LegalTy,
          {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs2, CostKind);
      return LT.first * Cost;
    }
    break;
  }
  default:
    break;
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

/// The function will remove redundant reinterpret casts in the presence of
/// control flow.
static Optional<Instruction *> processPhiNode(InstCombiner &IC,
                                              IntrinsicInst &II) {
  SmallVector<Instruction *, 32> Worklist;
  auto RequiredType = II.getType();

  auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
  assert(PN && "Expected Phi Node!");

  // Don't create a new Phi unless we can remove the old one.
  if (!PN->hasOneUse())
    return None;

  for (Value *IncValPhi : PN->incoming_values()) {
    auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
    if (!Reinterpret ||
        Reinterpret->getIntrinsicID() !=
            Intrinsic::aarch64_sve_convert_to_svbool ||
        RequiredType != Reinterpret->getArgOperand(0)->getType())
      return None;
  }

  // Create the new Phi
  LLVMContext &Ctx = PN->getContext();
  IRBuilder<> Builder(Ctx);
  Builder.SetInsertPoint(PN);
  PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
  Worklist.push_back(PN);

  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
    Worklist.push_back(Reinterpret);
  }

  // Cleanup Phi Node and reinterprets
  return IC.replaceInstUsesWith(II, NPN);
}

// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
// => (binop (pred) (from_svbool _) (from_svbool _))
//
// The above transformation eliminates a `to_svbool` in the predicate
// operand of bitwise operation `binop` by narrowing the vector width of
// the operation. For example, it would convert a `<vscale x 16 x i1>
// and` into a `<vscale x 4 x i1> and`. This is profitable because
// to_svbool must zero the new lanes during widening, whereas
// from_svbool is free.
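// Note: the rewrite below only fires when the pre-conversion predicate type
// matches the result type of this from_svbool call, so no active lanes can be
// dropped by the narrowing.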
478 static Optional<Instruction *> tryCombineFromSVBoolBinOp(InstCombiner &IC, 479 IntrinsicInst &II) { 480 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0)); 481 if (!BinOp) 482 return None; 483 484 auto IntrinsicID = BinOp->getIntrinsicID(); 485 switch (IntrinsicID) { 486 case Intrinsic::aarch64_sve_and_z: 487 case Intrinsic::aarch64_sve_bic_z: 488 case Intrinsic::aarch64_sve_eor_z: 489 case Intrinsic::aarch64_sve_nand_z: 490 case Intrinsic::aarch64_sve_nor_z: 491 case Intrinsic::aarch64_sve_orn_z: 492 case Intrinsic::aarch64_sve_orr_z: 493 break; 494 default: 495 return None; 496 } 497 498 auto BinOpPred = BinOp->getOperand(0); 499 auto BinOpOp1 = BinOp->getOperand(1); 500 auto BinOpOp2 = BinOp->getOperand(2); 501 502 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred); 503 if (!PredIntr || 504 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) 505 return None; 506 507 auto PredOp = PredIntr->getOperand(0); 508 auto PredOpTy = cast<VectorType>(PredOp->getType()); 509 if (PredOpTy != II.getType()) 510 return None; 511 512 IRBuilder<> Builder(II.getContext()); 513 Builder.SetInsertPoint(&II); 514 515 SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; 516 auto NarrowBinOpOp1 = Builder.CreateIntrinsic( 517 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1}); 518 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 519 if (BinOpOp1 == BinOpOp2) 520 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 521 else 522 NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic( 523 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2})); 524 525 auto NarrowedBinOp = 526 Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs); 527 return IC.replaceInstUsesWith(II, NarrowedBinOp); 528 } 529 530 static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC, 531 IntrinsicInst &II) { 532 // If the reinterpret instruction operand is a PHI Node 533 if (isa<PHINode>(II.getArgOperand(0))) 534 return processPhiNode(IC, II); 535 536 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) 537 return BinOpCombine; 538 539 SmallVector<Instruction *, 32> CandidatesForRemoval; 540 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; 541 542 const auto *IVTy = cast<VectorType>(II.getType()); 543 544 // Walk the chain of conversions. 545 while (Cursor) { 546 // If the type of the cursor has fewer lanes than the final result, zeroing 547 // must take place, which breaks the equivalence chain. 548 const auto *CursorVTy = cast<VectorType>(Cursor->getType()); 549 if (CursorVTy->getElementCount().getKnownMinValue() < 550 IVTy->getElementCount().getKnownMinValue()) 551 break; 552 553 // If the cursor has the same type as I, it is a viable replacement. 554 if (Cursor->getType() == IVTy) 555 EarliestReplacement = Cursor; 556 557 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); 558 559 // If this is not an SVE conversion intrinsic, this is the end of the chain. 560 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == 561 Intrinsic::aarch64_sve_convert_to_svbool || 562 IntrinsicCursor->getIntrinsicID() == 563 Intrinsic::aarch64_sve_convert_from_svbool)) 564 break; 565 566 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); 567 Cursor = IntrinsicCursor->getOperand(0); 568 } 569 570 // If no viable replacement in the conversion chain was found, there is 571 // nothing to do. 
572 if (!EarliestReplacement) 573 return None; 574 575 return IC.replaceInstUsesWith(II, EarliestReplacement); 576 } 577 578 static Optional<Instruction *> instCombineSVESel(InstCombiner &IC, 579 IntrinsicInst &II) { 580 IRBuilder<> Builder(&II); 581 auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1), 582 II.getOperand(2)); 583 return IC.replaceInstUsesWith(II, Select); 584 } 585 586 static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC, 587 IntrinsicInst &II) { 588 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 589 if (!Pg) 590 return None; 591 592 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 593 return None; 594 595 const auto PTruePattern = 596 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 597 if (PTruePattern != AArch64SVEPredPattern::vl1) 598 return None; 599 600 // The intrinsic is inserting into lane zero so use an insert instead. 601 auto *IdxTy = Type::getInt64Ty(II.getContext()); 602 auto *Insert = InsertElementInst::Create( 603 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0)); 604 Insert->insertBefore(&II); 605 Insert->takeName(&II); 606 607 return IC.replaceInstUsesWith(II, Insert); 608 } 609 610 static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, 611 IntrinsicInst &II) { 612 // Replace DupX with a regular IR splat. 613 IRBuilder<> Builder(II.getContext()); 614 Builder.SetInsertPoint(&II); 615 auto *RetTy = cast<ScalableVectorType>(II.getType()); 616 Value *Splat = 617 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0)); 618 Splat->takeName(&II); 619 return IC.replaceInstUsesWith(II, Splat); 620 } 621 622 static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, 623 IntrinsicInst &II) { 624 LLVMContext &Ctx = II.getContext(); 625 IRBuilder<> Builder(Ctx); 626 Builder.SetInsertPoint(&II); 627 628 // Check that the predicate is all active 629 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 630 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 631 return None; 632 633 const auto PTruePattern = 634 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 635 if (PTruePattern != AArch64SVEPredPattern::all) 636 return None; 637 638 // Check that we have a compare of zero.. 
639 auto *SplatValue = 640 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2))); 641 if (!SplatValue || !SplatValue->isZero()) 642 return None; 643 644 // ..against a dupq 645 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 646 if (!DupQLane || 647 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) 648 return None; 649 650 // Where the dupq is a lane 0 replicate of a vector insert 651 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero()) 652 return None; 653 654 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); 655 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert) 656 return None; 657 658 // Where the vector insert is a fixed constant vector insert into undef at 659 // index zero 660 if (!isa<UndefValue>(VecIns->getArgOperand(0))) 661 return None; 662 663 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero()) 664 return None; 665 666 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1)); 667 if (!ConstVec) 668 return None; 669 670 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType()); 671 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType()); 672 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) 673 return None; 674 675 unsigned NumElts = VecTy->getNumElements(); 676 unsigned PredicateBits = 0; 677 678 // Expand intrinsic operands to a 16-bit byte level predicate 679 for (unsigned I = 0; I < NumElts; ++I) { 680 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I)); 681 if (!Arg) 682 return None; 683 if (!Arg->isZero()) 684 PredicateBits |= 1 << (I * (16 / NumElts)); 685 } 686 687 // If all bits are zero bail early with an empty predicate 688 if (PredicateBits == 0) { 689 auto *PFalse = Constant::getNullValue(II.getType()); 690 PFalse->takeName(&II); 691 return IC.replaceInstUsesWith(II, PFalse); 692 } 693 694 // Calculate largest predicate type used (where byte predicate is largest) 695 unsigned Mask = 8; 696 for (unsigned I = 0; I < 16; ++I) 697 if ((PredicateBits & (1 << I)) != 0) 698 Mask |= (I % 8); 699 700 unsigned PredSize = Mask & -Mask; 701 auto *PredType = ScalableVectorType::get( 702 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8)); 703 704 // Ensure all relevant bits are set 705 for (unsigned I = 0; I < 16; I += PredSize) 706 if ((PredicateBits & (1 << I)) == 0) 707 return None; 708 709 auto *PTruePat = 710 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 711 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 712 {PredType}, {PTruePat}); 713 auto *ConvertToSVBool = Builder.CreateIntrinsic( 714 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); 715 auto *ConvertFromSVBool = 716 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, 717 {II.getType()}, {ConvertToSVBool}); 718 719 ConvertFromSVBool->takeName(&II); 720 return IC.replaceInstUsesWith(II, ConvertFromSVBool); 721 } 722 723 static Optional<Instruction *> instCombineSVELast(InstCombiner &IC, 724 IntrinsicInst &II) { 725 IRBuilder<> Builder(II.getContext()); 726 Builder.SetInsertPoint(&II); 727 Value *Pg = II.getArgOperand(0); 728 Value *Vec = II.getArgOperand(1); 729 auto IntrinsicID = II.getIntrinsicID(); 730 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; 731 732 // lastX(splat(X)) --> X 733 if (auto *SplatVal = getSplatValue(Vec)) 734 return IC.replaceInstUsesWith(II, SplatVal); 735 736 // If x and/or y is a splat value then: 737 // lastX (binop (x, y)) --> binop(lastX(x), 
lastX(y)) 738 Value *LHS, *RHS; 739 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) { 740 if (isSplatValue(LHS) || isSplatValue(RHS)) { 741 auto *OldBinOp = cast<BinaryOperator>(Vec); 742 auto OpC = OldBinOp->getOpcode(); 743 auto *NewLHS = 744 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS}); 745 auto *NewRHS = 746 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS}); 747 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( 748 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II); 749 return IC.replaceInstUsesWith(II, NewBinOp); 750 } 751 } 752 753 auto *C = dyn_cast<Constant>(Pg); 754 if (IsAfter && C && C->isNullValue()) { 755 // The intrinsic is extracting lane 0 so use an extract instead. 756 auto *IdxTy = Type::getInt64Ty(II.getContext()); 757 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0)); 758 Extract->insertBefore(&II); 759 Extract->takeName(&II); 760 return IC.replaceInstUsesWith(II, Extract); 761 } 762 763 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg); 764 if (!IntrPG) 765 return None; 766 767 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 768 return None; 769 770 const auto PTruePattern = 771 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue(); 772 773 // Can the intrinsic's predicate be converted to a known constant index? 774 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern); 775 if (!MinNumElts) 776 return None; 777 778 unsigned Idx = MinNumElts - 1; 779 // Increment the index if extracting the element after the last active 780 // predicate element. 781 if (IsAfter) 782 ++Idx; 783 784 // Ignore extracts whose index is larger than the known minimum vector 785 // length. NOTE: This is an artificial constraint where we prefer to 786 // maintain what the user asked for until an alternative is proven faster. 787 auto *PgVTy = cast<ScalableVectorType>(Pg->getType()); 788 if (Idx >= PgVTy->getMinNumElements()) 789 return None; 790 791 // The intrinsic is extracting a fixed lane so use an extract instead. 792 auto *IdxTy = Type::getInt64Ty(II.getContext()); 793 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx)); 794 Extract->insertBefore(&II); 795 Extract->takeName(&II); 796 return IC.replaceInstUsesWith(II, Extract); 797 } 798 799 static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC, 800 IntrinsicInst &II) { 801 LLVMContext &Ctx = II.getContext(); 802 IRBuilder<> Builder(Ctx); 803 Builder.SetInsertPoint(&II); 804 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr 805 // can work with RDFFR_PP for ptest elimination. 
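  // In effect (illustrative): rdffr() becomes rdffr.z(ptrue(all)), which reads
  // the same FFR value because the all-active predicate masks nothing out.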
806 auto *AllPat = 807 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 808 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 809 {II.getType()}, {AllPat}); 810 auto *RDFFR = 811 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); 812 RDFFR->takeName(&II); 813 return IC.replaceInstUsesWith(II, RDFFR); 814 } 815 816 static Optional<Instruction *> 817 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { 818 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); 819 820 if (Pattern == AArch64SVEPredPattern::all) { 821 LLVMContext &Ctx = II.getContext(); 822 IRBuilder<> Builder(Ctx); 823 Builder.SetInsertPoint(&II); 824 825 Constant *StepVal = ConstantInt::get(II.getType(), NumElts); 826 auto *VScale = Builder.CreateVScale(StepVal); 827 VScale->takeName(&II); 828 return IC.replaceInstUsesWith(II, VScale); 829 } 830 831 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); 832 833 return MinNumElts && NumElts >= MinNumElts 834 ? Optional<Instruction *>(IC.replaceInstUsesWith( 835 II, ConstantInt::get(II.getType(), MinNumElts))) 836 : None; 837 } 838 839 static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, 840 IntrinsicInst &II) { 841 IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 842 IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 843 844 if (Op1 && Op2 && 845 Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 846 Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 847 Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) { 848 849 IRBuilder<> Builder(II.getContext()); 850 Builder.SetInsertPoint(&II); 851 852 Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)}; 853 Type *Tys[] = {Op1->getArgOperand(0)->getType()}; 854 855 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); 856 857 PTest->takeName(&II); 858 return IC.replaceInstUsesWith(II, PTest); 859 } 860 861 return None; 862 } 863 864 static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC, 865 IntrinsicInst &II) { 866 // fold (fadd p a (fmul p b c)) -> (fma p a b c) 867 Value *P = II.getOperand(0); 868 Value *A = II.getOperand(1); 869 auto FMul = II.getOperand(2); 870 Value *B, *C; 871 if (!match(FMul, m_Intrinsic<Intrinsic::aarch64_sve_fmul>( 872 m_Specific(P), m_Value(B), m_Value(C)))) 873 return None; 874 875 if (!FMul->hasOneUse()) 876 return None; 877 878 llvm::FastMathFlags FAddFlags = II.getFastMathFlags(); 879 // Stop the combine when the flags on the inputs differ in case dropping flags 880 // would lead to us missing out on more beneficial optimizations. 881 if (FAddFlags != cast<CallInst>(FMul)->getFastMathFlags()) 882 return None; 883 if (!FAddFlags.allowContract()) 884 return None; 885 886 IRBuilder<> Builder(II.getContext()); 887 Builder.SetInsertPoint(&II); 888 auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla, 889 {II.getType()}, {P, A, B, C}, &II); 890 FMLA->setFastMathFlags(FAddFlags); 891 return IC.replaceInstUsesWith(II, FMLA); 892 } 893 894 static bool isAllActivePredicate(Value *Pred) { 895 // Look through convert.from.svbool(convert.to.svbool(...) chain. 
896 Value *UncastedPred; 897 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( 898 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( 899 m_Value(UncastedPred))))) 900 // If the predicate has the same or less lanes than the uncasted 901 // predicate then we know the casting has no effect. 902 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <= 903 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements()) 904 Pred = UncastedPred; 905 906 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 907 m_ConstantInt<AArch64SVEPredPattern::all>())); 908 } 909 910 static Optional<Instruction *> 911 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 912 IRBuilder<> Builder(II.getContext()); 913 Builder.SetInsertPoint(&II); 914 915 Value *Pred = II.getOperand(0); 916 Value *PtrOp = II.getOperand(1); 917 Type *VecTy = II.getType(); 918 Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo()); 919 920 if (isAllActivePredicate(Pred)) { 921 LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr); 922 Load->copyMetadata(II); 923 return IC.replaceInstUsesWith(II, Load); 924 } 925 926 CallInst *MaskedLoad = 927 Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL), 928 Pred, ConstantAggregateZero::get(VecTy)); 929 MaskedLoad->copyMetadata(II); 930 return IC.replaceInstUsesWith(II, MaskedLoad); 931 } 932 933 static Optional<Instruction *> 934 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 935 IRBuilder<> Builder(II.getContext()); 936 Builder.SetInsertPoint(&II); 937 938 Value *VecOp = II.getOperand(0); 939 Value *Pred = II.getOperand(1); 940 Value *PtrOp = II.getOperand(2); 941 Value *VecPtr = 942 Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo()); 943 944 if (isAllActivePredicate(Pred)) { 945 StoreInst *Store = Builder.CreateStore(VecOp, VecPtr); 946 Store->copyMetadata(II); 947 return IC.eraseInstFromFunction(II); 948 } 949 950 CallInst *MaskedStore = Builder.CreateMaskedStore( 951 VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred); 952 MaskedStore->copyMetadata(II); 953 return IC.eraseInstFromFunction(II); 954 } 955 956 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { 957 switch (Intrinsic) { 958 case Intrinsic::aarch64_sve_fmul: 959 return Instruction::BinaryOps::FMul; 960 case Intrinsic::aarch64_sve_fadd: 961 return Instruction::BinaryOps::FAdd; 962 case Intrinsic::aarch64_sve_fsub: 963 return Instruction::BinaryOps::FSub; 964 default: 965 return Instruction::BinaryOpsEnd; 966 } 967 } 968 969 static Optional<Instruction *> instCombineSVEVectorBinOp(InstCombiner &IC, 970 IntrinsicInst &II) { 971 auto *OpPredicate = II.getOperand(0); 972 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID()); 973 if (BinOpCode == Instruction::BinaryOpsEnd || 974 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 975 m_ConstantInt<AArch64SVEPredPattern::all>()))) 976 return None; 977 IRBuilder<> Builder(II.getContext()); 978 Builder.SetInsertPoint(&II); 979 Builder.setFastMathFlags(II.getFastMathFlags()); 980 auto BinOp = 981 Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2)); 982 return IC.replaceInstUsesWith(II, BinOp); 983 } 984 985 static Optional<Instruction *> instCombineSVEVectorFAdd(InstCombiner &IC, 986 IntrinsicInst &II) { 987 if (auto FMLA = instCombineSVEVectorFMLA(IC, II)) 988 return FMLA; 989 return instCombineSVEVectorBinOp(IC, II); 990 } 991 992 static Optional<Instruction *> 
instCombineSVEVectorMul(InstCombiner &IC, 993 IntrinsicInst &II) { 994 auto *OpPredicate = II.getOperand(0); 995 auto *OpMultiplicand = II.getOperand(1); 996 auto *OpMultiplier = II.getOperand(2); 997 998 IRBuilder<> Builder(II.getContext()); 999 Builder.SetInsertPoint(&II); 1000 1001 // Return true if a given instruction is a unit splat value, false otherwise. 1002 auto IsUnitSplat = [](auto *I) { 1003 auto *SplatValue = getSplatValue(I); 1004 if (!SplatValue) 1005 return false; 1006 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 1007 }; 1008 1009 // Return true if a given instruction is an aarch64_sve_dup intrinsic call 1010 // with a unit splat value, false otherwise. 1011 auto IsUnitDup = [](auto *I) { 1012 auto *IntrI = dyn_cast<IntrinsicInst>(I); 1013 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) 1014 return false; 1015 1016 auto *SplatValue = IntrI->getOperand(2); 1017 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 1018 }; 1019 1020 if (IsUnitSplat(OpMultiplier)) { 1021 // [f]mul pg %n, (dupx 1) => %n 1022 OpMultiplicand->takeName(&II); 1023 return IC.replaceInstUsesWith(II, OpMultiplicand); 1024 } else if (IsUnitDup(OpMultiplier)) { 1025 // [f]mul pg %n, (dup pg 1) => %n 1026 auto *DupInst = cast<IntrinsicInst>(OpMultiplier); 1027 auto *DupPg = DupInst->getOperand(1); 1028 // TODO: this is naive. The optimization is still valid if DupPg 1029 // 'encompasses' OpPredicate, not only if they're the same predicate. 1030 if (OpPredicate == DupPg) { 1031 OpMultiplicand->takeName(&II); 1032 return IC.replaceInstUsesWith(II, OpMultiplicand); 1033 } 1034 } 1035 1036 return instCombineSVEVectorBinOp(IC, II); 1037 } 1038 1039 static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, 1040 IntrinsicInst &II) { 1041 IRBuilder<> Builder(II.getContext()); 1042 Builder.SetInsertPoint(&II); 1043 Value *UnpackArg = II.getArgOperand(0); 1044 auto *RetTy = cast<ScalableVectorType>(II.getType()); 1045 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || 1046 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; 1047 1048 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) 1049 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) 1050 if (auto *ScalarArg = getSplatValue(UnpackArg)) { 1051 ScalarArg = 1052 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned); 1053 Value *NewVal = 1054 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg); 1055 NewVal->takeName(&II); 1056 return IC.replaceInstUsesWith(II, NewVal); 1057 } 1058 1059 return None; 1060 } 1061 static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC, 1062 IntrinsicInst &II) { 1063 auto *OpVal = II.getOperand(0); 1064 auto *OpIndices = II.getOperand(1); 1065 VectorType *VTy = cast<VectorType>(II.getType()); 1066 1067 // Check whether OpIndices is a constant splat value < minimal element count 1068 // of result. 1069 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices)); 1070 if (!SplatValue || 1071 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue())) 1072 return None; 1073 1074 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to 1075 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. 
1076 IRBuilder<> Builder(II.getContext()); 1077 Builder.SetInsertPoint(&II); 1078 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue); 1079 auto *VectorSplat = 1080 Builder.CreateVectorSplat(VTy->getElementCount(), Extract); 1081 1082 VectorSplat->takeName(&II); 1083 return IC.replaceInstUsesWith(II, VectorSplat); 1084 } 1085 1086 static Optional<Instruction *> instCombineSVETupleGet(InstCombiner &IC, 1087 IntrinsicInst &II) { 1088 // Try to remove sequences of tuple get/set. 1089 Value *SetTuple, *SetIndex, *SetValue; 1090 auto *GetTuple = II.getArgOperand(0); 1091 auto *GetIndex = II.getArgOperand(1); 1092 // Check that we have tuple_get(GetTuple, GetIndex) where GetTuple is a 1093 // call to tuple_set i.e. tuple_set(SetTuple, SetIndex, SetValue). 1094 // Make sure that the types of the current intrinsic and SetValue match 1095 // in order to safely remove the sequence. 1096 if (!match(GetTuple, 1097 m_Intrinsic<Intrinsic::aarch64_sve_tuple_set>( 1098 m_Value(SetTuple), m_Value(SetIndex), m_Value(SetValue))) || 1099 SetValue->getType() != II.getType()) 1100 return None; 1101 // Case where we get the same index right after setting it. 1102 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) --> SetValue 1103 if (GetIndex == SetIndex) 1104 return IC.replaceInstUsesWith(II, SetValue); 1105 // If we are getting a different index than what was set in the tuple_set 1106 // intrinsic. We can just set the input tuple to the one up in the chain. 1107 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) 1108 // --> tuple_get(SetTuple, GetIndex) 1109 return IC.replaceOperand(II, 0, SetTuple); 1110 } 1111 1112 static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC, 1113 IntrinsicInst &II) { 1114 // zip1(uzp1(A, B), uzp2(A, B)) --> A 1115 // zip2(uzp1(A, B), uzp2(A, B)) --> B 1116 Value *A, *B; 1117 if (match(II.getArgOperand(0), 1118 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) && 1119 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( 1120 m_Specific(A), m_Specific(B)))) 1121 return IC.replaceInstUsesWith( 1122 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); 1123 1124 return None; 1125 } 1126 1127 static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC, 1128 IntrinsicInst &II) { 1129 Value *Mask = II.getOperand(0); 1130 Value *BasePtr = II.getOperand(1); 1131 Value *Index = II.getOperand(2); 1132 Type *Ty = II.getType(); 1133 Value *PassThru = ConstantAggregateZero::get(Ty); 1134 1135 // Contiguous gather => masked load. 
1136 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) 1137 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) 1138 Value *IndexBase; 1139 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1140 m_Value(IndexBase), m_SpecificInt(1)))) { 1141 IRBuilder<> Builder(II.getContext()); 1142 Builder.SetInsertPoint(&II); 1143 1144 Align Alignment = 1145 BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); 1146 1147 Type *VecPtrTy = PointerType::getUnqual(Ty); 1148 Value *Ptr = Builder.CreateGEP( 1149 cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase); 1150 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); 1151 CallInst *MaskedLoad = 1152 Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); 1153 MaskedLoad->takeName(&II); 1154 return IC.replaceInstUsesWith(II, MaskedLoad); 1155 } 1156 1157 return None; 1158 } 1159 1160 static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC, 1161 IntrinsicInst &II) { 1162 Value *Val = II.getOperand(0); 1163 Value *Mask = II.getOperand(1); 1164 Value *BasePtr = II.getOperand(2); 1165 Value *Index = II.getOperand(3); 1166 Type *Ty = Val->getType(); 1167 1168 // Contiguous scatter => masked store. 1169 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) 1170 // => (masked.store Value (gep BasePtr IndexBase) Align Mask) 1171 Value *IndexBase; 1172 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1173 m_Value(IndexBase), m_SpecificInt(1)))) { 1174 IRBuilder<> Builder(II.getContext()); 1175 Builder.SetInsertPoint(&II); 1176 1177 Align Alignment = 1178 BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); 1179 1180 Value *Ptr = Builder.CreateGEP( 1181 cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase); 1182 Type *VecPtrTy = PointerType::getUnqual(Ty); 1183 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); 1184 1185 (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); 1186 1187 return IC.eraseInstFromFunction(II); 1188 } 1189 1190 return None; 1191 } 1192 1193 static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, 1194 IntrinsicInst &II) { 1195 IRBuilder<> Builder(II.getContext()); 1196 Builder.SetInsertPoint(&II); 1197 Type *Int32Ty = Builder.getInt32Ty(); 1198 Value *Pred = II.getOperand(0); 1199 Value *Vec = II.getOperand(1); 1200 Value *DivVec = II.getOperand(2); 1201 1202 Value *SplatValue = getSplatValue(DivVec); 1203 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue); 1204 if (!SplatConstantInt) 1205 return None; 1206 APInt Divisor = SplatConstantInt->getValue(); 1207 1208 if (Divisor.isPowerOf2()) { 1209 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1210 auto ASRD = Builder.CreateIntrinsic( 1211 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1212 return IC.replaceInstUsesWith(II, ASRD); 1213 } 1214 if (Divisor.isNegatedPowerOf2()) { 1215 Divisor.negate(); 1216 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1217 auto ASRD = Builder.CreateIntrinsic( 1218 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1219 auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg, 1220 {ASRD->getType()}, {ASRD, Pred, ASRD}); 1221 return IC.replaceInstUsesWith(II, NEG); 1222 } 1223 1224 return None; 1225 } 1226 1227 static Optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, 1228 IntrinsicInst &II) { 1229 Value *A = II.getArgOperand(0); 1230 Value *B = II.getArgOperand(1); 1231 if (A == B) 1232 return 
IC.replaceInstUsesWith(II, A); 1233 1234 return None; 1235 } 1236 1237 static Optional<Instruction *> instCombineSVESrshl(InstCombiner &IC, 1238 IntrinsicInst &II) { 1239 IRBuilder<> Builder(&II); 1240 Value *Pred = II.getOperand(0); 1241 Value *Vec = II.getOperand(1); 1242 Value *Shift = II.getOperand(2); 1243 1244 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. 1245 Value *AbsPred, *MergedValue; 1246 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>( 1247 m_Value(MergedValue), m_Value(AbsPred), m_Value())) && 1248 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>( 1249 m_Value(MergedValue), m_Value(AbsPred), m_Value()))) 1250 1251 return None; 1252 1253 // Transform is valid if any of the following are true: 1254 // * The ABS merge value is an undef or non-negative 1255 // * The ABS predicate is all active 1256 // * The ABS predicate and the SRSHL predicates are the same 1257 if (!isa<UndefValue>(MergedValue) && 1258 !match(MergedValue, m_NonNegative()) && 1259 AbsPred != Pred && !isAllActivePredicate(AbsPred)) 1260 return None; 1261 1262 // Only valid when the shift amount is non-negative, otherwise the rounding 1263 // behaviour of SRSHL cannot be ignored. 1264 if (!match(Shift, m_NonNegative())) 1265 return None; 1266 1267 auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()}, 1268 {Pred, Vec, Shift}); 1269 1270 return IC.replaceInstUsesWith(II, LSL); 1271 } 1272 1273 Optional<Instruction *> 1274 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, 1275 IntrinsicInst &II) const { 1276 Intrinsic::ID IID = II.getIntrinsicID(); 1277 switch (IID) { 1278 default: 1279 break; 1280 case Intrinsic::aarch64_neon_fmaxnm: 1281 case Intrinsic::aarch64_neon_fminnm: 1282 return instCombineMaxMinNM(IC, II); 1283 case Intrinsic::aarch64_sve_convert_from_svbool: 1284 return instCombineConvertFromSVBool(IC, II); 1285 case Intrinsic::aarch64_sve_dup: 1286 return instCombineSVEDup(IC, II); 1287 case Intrinsic::aarch64_sve_dup_x: 1288 return instCombineSVEDupX(IC, II); 1289 case Intrinsic::aarch64_sve_cmpne: 1290 case Intrinsic::aarch64_sve_cmpne_wide: 1291 return instCombineSVECmpNE(IC, II); 1292 case Intrinsic::aarch64_sve_rdffr: 1293 return instCombineRDFFR(IC, II); 1294 case Intrinsic::aarch64_sve_lasta: 1295 case Intrinsic::aarch64_sve_lastb: 1296 return instCombineSVELast(IC, II); 1297 case Intrinsic::aarch64_sve_cntd: 1298 return instCombineSVECntElts(IC, II, 2); 1299 case Intrinsic::aarch64_sve_cntw: 1300 return instCombineSVECntElts(IC, II, 4); 1301 case Intrinsic::aarch64_sve_cnth: 1302 return instCombineSVECntElts(IC, II, 8); 1303 case Intrinsic::aarch64_sve_cntb: 1304 return instCombineSVECntElts(IC, II, 16); 1305 case Intrinsic::aarch64_sve_ptest_any: 1306 case Intrinsic::aarch64_sve_ptest_first: 1307 case Intrinsic::aarch64_sve_ptest_last: 1308 return instCombineSVEPTest(IC, II); 1309 case Intrinsic::aarch64_sve_mul: 1310 case Intrinsic::aarch64_sve_fmul: 1311 return instCombineSVEVectorMul(IC, II); 1312 case Intrinsic::aarch64_sve_fadd: 1313 return instCombineSVEVectorFAdd(IC, II); 1314 case Intrinsic::aarch64_sve_fsub: 1315 return instCombineSVEVectorBinOp(IC, II); 1316 case Intrinsic::aarch64_sve_tbl: 1317 return instCombineSVETBL(IC, II); 1318 case Intrinsic::aarch64_sve_uunpkhi: 1319 case Intrinsic::aarch64_sve_uunpklo: 1320 case Intrinsic::aarch64_sve_sunpkhi: 1321 case Intrinsic::aarch64_sve_sunpklo: 1322 return instCombineSVEUnpack(IC, II); 1323 case Intrinsic::aarch64_sve_tuple_get: 1324 return 
instCombineSVETupleGet(IC, II); 1325 case Intrinsic::aarch64_sve_zip1: 1326 case Intrinsic::aarch64_sve_zip2: 1327 return instCombineSVEZip(IC, II); 1328 case Intrinsic::aarch64_sve_ld1_gather_index: 1329 return instCombineLD1GatherIndex(IC, II); 1330 case Intrinsic::aarch64_sve_st1_scatter_index: 1331 return instCombineST1ScatterIndex(IC, II); 1332 case Intrinsic::aarch64_sve_ld1: 1333 return instCombineSVELD1(IC, II, DL); 1334 case Intrinsic::aarch64_sve_st1: 1335 return instCombineSVEST1(IC, II, DL); 1336 case Intrinsic::aarch64_sve_sdiv: 1337 return instCombineSVESDIV(IC, II); 1338 case Intrinsic::aarch64_sve_sel: 1339 return instCombineSVESel(IC, II); 1340 case Intrinsic::aarch64_sve_srshl: 1341 return instCombineSVESrshl(IC, II); 1342 } 1343 1344 return None; 1345 } 1346 1347 Optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( 1348 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, 1349 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, 1350 std::function<void(Instruction *, unsigned, APInt, APInt &)> 1351 SimplifyAndSetOp) const { 1352 switch (II.getIntrinsicID()) { 1353 default: 1354 break; 1355 case Intrinsic::aarch64_neon_fcvtxn: 1356 case Intrinsic::aarch64_neon_rshrn: 1357 case Intrinsic::aarch64_neon_sqrshrn: 1358 case Intrinsic::aarch64_neon_sqrshrun: 1359 case Intrinsic::aarch64_neon_sqshrn: 1360 case Intrinsic::aarch64_neon_sqshrun: 1361 case Intrinsic::aarch64_neon_sqxtn: 1362 case Intrinsic::aarch64_neon_sqxtun: 1363 case Intrinsic::aarch64_neon_uqrshrn: 1364 case Intrinsic::aarch64_neon_uqshrn: 1365 case Intrinsic::aarch64_neon_uqxtn: 1366 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); 1367 break; 1368 } 1369 1370 return None; 1371 } 1372 1373 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, 1374 ArrayRef<const Value *> Args) { 1375 1376 // A helper that returns a vector type from the given type. The number of 1377 // elements in type Ty determines the vector width. 1378 auto toVectorTy = [&](Type *ArgTy) { 1379 return VectorType::get(ArgTy->getScalarType(), 1380 cast<VectorType>(DstTy)->getElementCount()); 1381 }; 1382 1383 // Exit early if DstTy is not a vector type whose elements are at least 1384 // 16-bits wide. 1385 if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16) 1386 return false; 1387 1388 // Determine if the operation has a widening variant. We consider both the 1389 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the 1390 // instructions. 1391 // 1392 // TODO: Add additional widening operations (e.g., shl, etc.) once we 1393 // verify that their extending operands are eliminated during code 1394 // generation. 1395 switch (Opcode) { 1396 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). 1397 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). 1398 case Instruction::Mul: // SMULL(2), UMULL(2) 1399 break; 1400 default: 1401 return false; 1402 } 1403 1404 // To be a widening instruction (either the "wide" or "long" versions), the 1405 // second operand must be a sign- or zero extend. 1406 if (Args.size() != 2 || 1407 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1]))) 1408 return false; 1409 auto *Extend = cast<CastInst>(Args[1]); 1410 auto *Arg0 = dyn_cast<CastInst>(Args[0]); 1411 1412 // A mul only has a mull version (not like addw). Both operands need to be 1413 // extending and the same type. 
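  // For example (illustrative): 'mul <8 x i16> (sext <8 x i8> %a),
  // (sext <8 x i8> %b)' can be selected as a single smull, making both
  // sign-extends free.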
1414 if (Opcode == Instruction::Mul && 1415 (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() || 1416 Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType())) 1417 return false; 1418 1419 // Legalize the destination type and ensure it can be used in a widening 1420 // operation. 1421 auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); 1422 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); 1423 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) 1424 return false; 1425 1426 // Legalize the source type and ensure it can be used in a widening 1427 // operation. 1428 auto *SrcTy = toVectorTy(Extend->getSrcTy()); 1429 auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); 1430 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); 1431 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) 1432 return false; 1433 1434 // Get the total number of vector elements in the legalized types. 1435 InstructionCost NumDstEls = 1436 DstTyL.first * DstTyL.second.getVectorMinNumElements(); 1437 InstructionCost NumSrcEls = 1438 SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); 1439 1440 // Return true if the legalized types have the same number of vector elements 1441 // and the destination element type size is twice that of the source type. 1442 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; 1443 } 1444 1445 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 1446 Type *Src, 1447 TTI::CastContextHint CCH, 1448 TTI::TargetCostKind CostKind, 1449 const Instruction *I) { 1450 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1451 assert(ISD && "Invalid opcode"); 1452 1453 // If the cast is observable, and it is used by a widening instruction (e.g., 1454 // uaddl, saddw, etc.), it may be free. 1455 if (I && I->hasOneUser()) { 1456 auto *SingleUser = cast<Instruction>(*I->user_begin()); 1457 SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); 1458 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { 1459 // If the cast is the second operand, it is free. We will generate either 1460 // a "wide" or "long" version of the widening instruction. 1461 if (I == SingleUser->getOperand(1)) 1462 return 0; 1463 // If the cast is not the second operand, it will be free if it looks the 1464 // same as the second operand. In this case, we will generate a "long" 1465 // version of the widening instruction. 1466 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) 1467 if (I->getOpcode() == unsigned(Cast->getOpcode()) && 1468 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) 1469 return 0; 1470 } 1471 } 1472 1473 // TODO: Allow non-throughput costs that aren't binary. 1474 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { 1475 if (CostKind != TTI::TCK_RecipThroughput) 1476 return Cost == 0 ? 
0 : 1; 1477 return Cost; 1478 }; 1479 1480 EVT SrcTy = TLI->getValueType(DL, Src); 1481 EVT DstTy = TLI->getValueType(DL, Dst); 1482 1483 if (!SrcTy.isSimple() || !DstTy.isSimple()) 1484 return AdjustCost( 1485 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1486 1487 static const TypeConversionCostTblEntry 1488 ConversionTbl[] = { 1489 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, 1490 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, 1491 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, 1492 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, 1493 1494 // Truncations on nxvmiN 1495 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, 1496 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, 1497 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, 1498 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, 1499 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, 1500 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, 1501 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, 1502 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, 1503 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, 1504 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, 1505 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, 1506 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, 1507 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, 1508 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, 1509 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, 1510 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, 1511 1512 // The number of shll instructions for the extension. 1513 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1514 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1515 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1516 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1517 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1518 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1519 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1520 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1521 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1522 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1523 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1524 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1525 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1526 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1527 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1528 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1529 1530 // LowerVectorINT_TO_FP: 1531 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1532 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1533 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1534 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1535 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1536 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1537 1538 // Complex: to v2f32 1539 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1540 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1541 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1542 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1543 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1544 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1545 1546 // Complex: to v4f32 1547 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, 1548 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1549 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 1550 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1551 1552 // Complex: to v8f32 1553 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1554 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 1555 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1556 { ISD::UINT_TO_FP, MVT::v8f32, 
MVT::v8i16, 4 }, 1557 1558 // Complex: to v16f32 1559 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1560 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1561 1562 // Complex: to v2f64 1563 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1564 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1565 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1566 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1567 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1568 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1569 1570 1571 // LowerVectorFP_TO_INT 1572 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, 1573 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 1574 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 1575 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, 1576 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 1577 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 1578 1579 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). 1580 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, 1581 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, 1582 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, 1583 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, 1584 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, 1585 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, 1586 1587 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 1588 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, 1589 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, 1590 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, 1591 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, 1592 1593 // Complex, from nxv2f32. 1594 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1595 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1596 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1597 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1598 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1599 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1600 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1601 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1602 1603 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. 1604 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, 1605 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, 1606 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, 1607 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, 1608 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, 1609 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, 1610 1611 // Complex, from nxv2f64. 1612 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1613 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1614 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1615 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1616 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1617 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1618 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1619 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1620 1621 // Complex, from nxv4f32. 1622 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1623 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1624 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1625 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1626 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1627 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1628 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1629 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1630 1631 // Complex, from nxv8f64. Illegal -> illegal conversions not required. 
1632 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1633 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1634 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1635 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1636 1637 // Complex, from nxv4f64. Illegal -> illegal conversions not required. 1638 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1639 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1640 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1641 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1642 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1643 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1644 1645 // Complex, from nxv8f32. Illegal -> illegal conversions not required. 1646 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 1647 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 1648 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 1649 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 1650 1651 // Complex, from nxv8f16. 1652 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 1653 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 1654 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 1655 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 1656 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 1657 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 1658 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 1659 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 1660 1661 // Complex, from nxv4f16. 1662 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 1663 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 1664 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 1665 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 1666 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 1667 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 1668 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 1669 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 1670 1671 // Complex, from nxv2f16. 1672 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 1673 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 1674 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 1675 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 1676 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 1677 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 1678 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 1679 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 1680 1681 // Truncate from nxvmf32 to nxvmf16. 1682 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, 1683 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, 1684 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, 1685 1686 // Truncate from nxvmf64 to nxvmf16. 1687 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 }, 1688 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 }, 1689 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 }, 1690 1691 // Truncate from nxvmf64 to nxvmf32. 1692 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 }, 1693 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 }, 1694 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 }, 1695 1696 // Extend from nxvmf16 to nxvmf32. 1697 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, 1698 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, 1699 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, 1700 1701 // Extend from nxvmf16 to nxvmf64. 1702 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, 1703 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, 1704 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, 1705 1706 // Extend from nxvmf32 to nxvmf64. 
1707 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, 1708 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, 1709 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, 1710 1711 // Bitcasts from float to integer 1712 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 }, 1713 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 }, 1714 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 }, 1715 1716 // Bitcasts from integer to float 1717 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 }, 1718 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 }, 1719 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 }, 1720 }; 1721 1722 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, 1723 DstTy.getSimpleVT(), 1724 SrcTy.getSimpleVT())) 1725 return AdjustCost(Entry->Cost); 1726 1727 static const TypeConversionCostTblEntry FP16Tbl[] = { 1728 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs 1729 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, 1730 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs 1731 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, 1732 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs 1733 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, 1734 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn 1735 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, 1736 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs 1737 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, 1738 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs 1739 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, 1740 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn 1741 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, 1742 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs 1743 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, 1744 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs 1745 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, 1746 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf 1747 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf 1748 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf 1749 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf 1750 }; 1751 1752 if (ST->hasFullFP16()) 1753 if (const auto *Entry = ConvertCostTableLookup( 1754 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 1755 return AdjustCost(Entry->Cost); 1756 1757 return AdjustCost( 1758 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1759 } 1760 1761 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, 1762 Type *Dst, 1763 VectorType *VecTy, 1764 unsigned Index) { 1765 1766 // Make sure we were given a valid extend opcode. 1767 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && 1768 "Invalid opcode"); 1769 1770 // We are extending an element we extract from a vector, so the source type 1771 // of the extend is the element type of the vector. 1772 auto *Src = VecTy->getElementType(); 1773 1774 // Sign- and zero-extends are for integer types only. 1775 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); 1776 1777 // Get the cost for the extract. We compute the cost (if any) for the extend 1778 // below. 1779 InstructionCost Cost = 1780 getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); 1781 1782 // Legalize the types. 
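  // Note: the legalized types computed below determine whether the extract
  // stays a vector operation and whether the extend can be folded into the
  // smov/umov that performs it; otherwise we fall back to the generic cast
  // cost.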
1783 auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); 1784 auto DstVT = TLI->getValueType(DL, Dst); 1785 auto SrcVT = TLI->getValueType(DL, Src); 1786 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1787 1788 // If the resulting type is still a vector and the destination type is legal, 1789 // we may get the extension for free. If not, get the default cost for the 1790 // extend. 1791 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) 1792 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1793 CostKind); 1794 1795 // The destination type should be larger than the element type. If not, get 1796 // the default cost for the extend. 1797 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) 1798 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1799 CostKind); 1800 1801 switch (Opcode) { 1802 default: 1803 llvm_unreachable("Opcode should be either SExt or ZExt"); 1804 1805 // For sign-extends, we only need a smov, which performs the extension 1806 // automatically. 1807 case Instruction::SExt: 1808 return Cost; 1809 1810 // For zero-extends, the extend is performed automatically by a umov unless 1811 // the destination type is i64 and the element type is i8 or i16. 1812 case Instruction::ZExt: 1813 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) 1814 return Cost; 1815 } 1816 1817 // If we are unable to perform the extend for free, get the default cost. 1818 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1819 CostKind); 1820 } 1821 1822 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, 1823 TTI::TargetCostKind CostKind, 1824 const Instruction *I) { 1825 if (CostKind != TTI::TCK_RecipThroughput) 1826 return Opcode == Instruction::PHI ? 0 : 1; 1827 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); 1828 // Branches are assumed to be predicted. 1829 return 0; 1830 } 1831 1832 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 1833 unsigned Index) { 1834 assert(Val->isVectorTy() && "This must be a vector type"); 1835 1836 if (Index != -1U) { 1837 // Legalize the type. 1838 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); 1839 1840 // This type is legalized to a scalar type. 1841 if (!LT.second.isVector()) 1842 return 0; 1843 1844 // The type may be split. For fixed-width vectors we can normalize the 1845 // index to the new type. 1846 if (LT.second.isFixedLengthVector()) { 1847 unsigned Width = LT.second.getVectorNumElements(); 1848 Index = Index % Width; 1849 } 1850 1851 // The element at index zero is already inside the vector. 1852 if (Index == 0) 1853 return 0; 1854 } 1855 1856 // All other insert/extracts cost this much. 1857 return ST->getVectorInsertExtractBaseCost(); 1858 } 1859 1860 InstructionCost AArch64TTIImpl::getArithmeticInstrCost( 1861 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 1862 TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, 1863 TTI::OperandValueProperties Opd1PropInfo, 1864 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, 1865 const Instruction *CxtI) { 1866 // TODO: Handle more cost kinds. 1867 if (CostKind != TTI::TCK_RecipThroughput) 1868 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 1869 Opd2Info, Opd1PropInfo, 1870 Opd2PropInfo, Args, CxtI); 1871 1872 // Legalize the type. 
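  // In the legalization result below, LT.second is the legalized MVT that
  // drives the per-ISD cases, while LT.first is (roughly) the number of
  // legal-sized pieces the original type is split into.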
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo, Opd2PropInfo);
  case ISD::SDIV:
    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
        Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
      // On AArch64, scalar signed division by a power-of-two constant is
      // normally expanded to the sequence ADD + CMP + SELECT + SRA.
      // The OperandValue properties may not be the same as those of the
      // previous operation; conservatively assume OP_None.
      InstructionCost Cost = getArithmeticInstrCost(
          Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
          TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Opd1Info,
                                     Opd2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(
          Instruction::Select, Ty, CostKind, Opd1Info, Opd2Info,
          TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Opd1Info,
                                     Opd2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      return Cost;
    }
    LLVM_FALLTHROUGH;
  case ISD::UDIV: {
    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
      auto VT = TLI->getValueType(DL, Ty);
      if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
        // Vector signed division by a constant is expanded to the sequence
        // MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division to
        // MULHU + SUB + SRL + ADD + SRL.
        InstructionCost MulCost = getArithmeticInstrCost(
            Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
            TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
        InstructionCost AddCost = getArithmeticInstrCost(
            Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
            TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
        InstructionCost ShrCost = getArithmeticInstrCost(
            Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
            TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
        return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
      }
    }

    InstructionCost Cost = BaseT::getArithmeticInstrCost(
        Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
    if (Ty->isVectorTy()) {
      // On AArch64, vector divisions are not supported natively and are
      // expanded into scalar divisions of each pair of elements.
      Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
                                     Opd1Info, Opd2Info, Opd1PropInfo,
                                     Opd2PropInfo);
      Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
                                     Opd1Info, Opd2Info, Opd1PropInfo,
                                     Opd2PropInfo);
      // TODO: if one of the arguments is scalar, then it's not necessary to
      // double the cost of handling the vector elements.
      Cost += Cost;
    }
    return Cost;
  }
  case ISD::MUL:
    // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
    // as elements are extracted from the vectors and the muls scalarized.
1942 // As getScalarizationOverhead is a bit too pessimistic, we estimate the 1943 // cost for a i64 vector directly here, which is: 1944 // - four 2-cost i64 extracts, 1945 // - two 2-cost i64 inserts, and 1946 // - two 1-cost muls. 1947 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with 1948 // LT.first = 2 the cost is 28. If both operands are extensions it will not 1949 // need to scalarize so the cost can be cheaper (smull or umull). 1950 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) 1951 return LT.first; 1952 return LT.first * 14; 1953 case ISD::ADD: 1954 case ISD::XOR: 1955 case ISD::OR: 1956 case ISD::AND: 1957 case ISD::SRL: 1958 case ISD::SRA: 1959 case ISD::SHL: 1960 // These nodes are marked as 'custom' for combining purposes only. 1961 // We know that they are legal. See LowerAdd in ISelLowering. 1962 return LT.first; 1963 1964 case ISD::FADD: 1965 case ISD::FSUB: 1966 case ISD::FMUL: 1967 case ISD::FDIV: 1968 case ISD::FNEG: 1969 // These nodes are marked as 'custom' just to lower them to SVE. 1970 // We know said lowering will incur no additional cost. 1971 if (!Ty->getScalarType()->isFP128Ty()) 1972 return 2 * LT.first; 1973 1974 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 1975 Opd2Info, Opd1PropInfo, Opd2PropInfo); 1976 } 1977 } 1978 1979 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, 1980 ScalarEvolution *SE, 1981 const SCEV *Ptr) { 1982 // Address computations in vectorized code with non-consecutive addresses will 1983 // likely result in more instructions compared to scalar code where the 1984 // computation can more often be merged into the index mode. The resulting 1985 // extra micro-ops can significantly decrease throughput. 1986 unsigned NumVectorInstToHideOverhead = 10; 1987 int MaxMergeDistance = 64; 1988 1989 if (Ty->isVectorTy() && SE && 1990 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) 1991 return NumVectorInstToHideOverhead; 1992 1993 // In many cases the address computation is not merged into the instruction 1994 // addressing mode. 1995 return 1; 1996 } 1997 1998 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 1999 Type *CondTy, 2000 CmpInst::Predicate VecPred, 2001 TTI::TargetCostKind CostKind, 2002 const Instruction *I) { 2003 // TODO: Handle other cost kinds. 2004 if (CostKind != TTI::TCK_RecipThroughput) 2005 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 2006 I); 2007 2008 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2009 // We don't lower some vector selects well that are wider than the register 2010 // width. 2011 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { 2012 // We would need this many instructions to hide the scalarization happening. 2013 const int AmortizationCost = 20; 2014 2015 // If VecPred is not set, check if we can get a predicate from the context 2016 // instruction, if its type matches the requested ValTy. 2017 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { 2018 CmpInst::Predicate CurrentPred; 2019 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), 2020 m_Value()))) 2021 VecPred = CurrentPred; 2022 } 2023 // Check if we have a compare/select chain that can be lowered using 2024 // a (F)CMxx & BFI pair. 
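    // Such a chain lowers to a single compare plus a vector bit-select, so for
    // the NEON types listed below we charge roughly one instruction per
    // legalization step (LT.first).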
    if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
        VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
        VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
        VecPred == CmpInst::FCMP_UNE) {
      static const auto ValidMinMaxTys = {
          MVT::v8i8,  MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
          MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
      static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};

      auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
      if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
          (ST->hasFullFP16() &&
           any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
        return LT.first;
    }

    static const TypeConversionCostTblEntry
    VectorSelectTbl[] = {
      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }
  }
  // The base case handles scalable vectors fine for now, since it treats the
  // cost as 1 * legalization cost.
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

AArch64TTIImpl::TTI::MemCmpExpansionOptions
AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;
  if (ST->requiresStrictAlign()) {
    // TODO: Add cost modeling for strict align. Misaligned loads expand to
    // a bunch of instructions when strict align is enabled.
    return Options;
  }
  Options.AllowOverlappingLoads = true;
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  Options.NumLoadsPerBlock = Options.MaxNumLoads;
  // TODO: Though vector loads usually perform well on AArch64, on some targets
  // they may wake up the FP unit, which raises the power consumption. Perhaps
  // they could be used with no holds barred (-O3).
  Options.LoadSizes = {8, 4, 2, 1};
  return Options;
}

bool AArch64TTIImpl::prefersVectorizedAddressing() const {
  return ST->hasSVE();
}

InstructionCost
AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                      Align Alignment, unsigned AddressSpace,
                                      TTI::TargetCostKind CostKind) {
  if (useNeonVector(Src))
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);
  auto LT = TLI->getTypeLegalizationCost(DL, Src);
  if (!LT.first.isValid())
    return InstructionCost::getInvalid();

  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
2102 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1)) 2103 return InstructionCost::getInvalid(); 2104 2105 return LT.first * 2; 2106 } 2107 2108 static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { 2109 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead; 2110 } 2111 2112 InstructionCost AArch64TTIImpl::getGatherScatterOpCost( 2113 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 2114 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 2115 if (useNeonVector(DataTy)) 2116 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 2117 Alignment, CostKind, I); 2118 auto *VT = cast<VectorType>(DataTy); 2119 auto LT = TLI->getTypeLegalizationCost(DL, DataTy); 2120 if (!LT.first.isValid()) 2121 return InstructionCost::getInvalid(); 2122 2123 // The code-generator is currently not able to handle scalable vectors 2124 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2125 // it. This change will be removed when code-generation for these types is 2126 // sufficiently reliable. 2127 if (cast<VectorType>(DataTy)->getElementCount() == 2128 ElementCount::getScalable(1)) 2129 return InstructionCost::getInvalid(); 2130 2131 ElementCount LegalVF = LT.second.getVectorElementCount(); 2132 InstructionCost MemOpCost = 2133 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); 2134 // Add on an overhead cost for using gathers/scatters. 2135 // TODO: At the moment this is applied unilaterally for all CPUs, but at some 2136 // point we may want a per-CPU overhead. 2137 MemOpCost *= getSVEGatherScatterOverhead(Opcode); 2138 return LT.first * MemOpCost * getMaxNumElements(LegalVF); 2139 } 2140 2141 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { 2142 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); 2143 } 2144 2145 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, 2146 MaybeAlign Alignment, 2147 unsigned AddressSpace, 2148 TTI::TargetCostKind CostKind, 2149 const Instruction *I) { 2150 EVT VT = TLI->getValueType(DL, Ty, true); 2151 // Type legalization can't handle structs 2152 if (VT == MVT::Other) 2153 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, 2154 CostKind); 2155 2156 auto LT = TLI->getTypeLegalizationCost(DL, Ty); 2157 if (!LT.first.isValid()) 2158 return InstructionCost::getInvalid(); 2159 2160 // The code-generator is currently not able to handle scalable vectors 2161 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2162 // it. This change will be removed when code-generation for these types is 2163 // sufficiently reliable. 2164 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) 2165 if (VTy->getElementCount() == ElementCount::getScalable(1)) 2166 return InstructionCost::getInvalid(); 2167 2168 // TODO: consider latency as well for TCK_SizeAndLatency. 2169 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) 2170 return LT.first; 2171 2172 if (CostKind != TTI::TCK_RecipThroughput) 2173 return 1; 2174 2175 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && 2176 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) { 2177 // Unaligned stores are extremely inefficient. We don't split all 2178 // unaligned 128-bit stores because the negative impact that has shown in 2179 // practice on inlined block copy code. 
2180 // We make such stores expensive so that we will only vectorize if there 2181 // are 6 other instructions getting vectorized. 2182 const int AmortizationCost = 6; 2183 2184 return LT.first * 2 * AmortizationCost; 2185 } 2186 2187 // Check truncating stores and extending loads. 2188 if (useNeonVector(Ty) && 2189 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { 2190 // v4i8 types are lowered to scalar a load/store and sshll/xtn. 2191 if (VT == MVT::v4i8) 2192 return 2; 2193 // Otherwise we need to scalarize. 2194 return cast<FixedVectorType>(Ty)->getNumElements() * 2; 2195 } 2196 2197 return LT.first; 2198 } 2199 2200 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( 2201 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 2202 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 2203 bool UseMaskForCond, bool UseMaskForGaps) { 2204 assert(Factor >= 2 && "Invalid interleave factor"); 2205 auto *VecVTy = cast<FixedVectorType>(VecTy); 2206 2207 if (!UseMaskForCond && !UseMaskForGaps && 2208 Factor <= TLI->getMaxSupportedInterleaveFactor()) { 2209 unsigned NumElts = VecVTy->getNumElements(); 2210 auto *SubVecTy = 2211 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); 2212 2213 // ldN/stN only support legal vector types of size 64 or 128 in bits. 2214 // Accesses having vector types that are a multiple of 128 bits can be 2215 // matched to more than one ldN/stN instruction. 2216 bool UseScalable; 2217 if (NumElts % Factor == 0 && 2218 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) 2219 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); 2220 } 2221 2222 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 2223 Alignment, AddressSpace, CostKind, 2224 UseMaskForCond, UseMaskForGaps); 2225 } 2226 2227 InstructionCost 2228 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { 2229 InstructionCost Cost = 0; 2230 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2231 for (auto *I : Tys) { 2232 if (!I->isVectorTy()) 2233 continue; 2234 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == 2235 128) 2236 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + 2237 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); 2238 } 2239 return Cost; 2240 } 2241 2242 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { 2243 return ST->getMaxInterleaveFactor(); 2244 } 2245 2246 // For Falkor, we want to avoid having too many strided loads in a loop since 2247 // that can exhaust the HW prefetcher resources. We adjust the unroller 2248 // MaxCount preference below to attempt to ensure unrolling doesn't create too 2249 // many strided loads. 2250 static void 2251 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2252 TargetTransformInfo::UnrollingPreferences &UP) { 2253 enum { MaxStridedLoads = 7 }; 2254 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { 2255 int StridedLoads = 0; 2256 // FIXME? We could make this more precise by looking at the CFG and 2257 // e.g. not counting loads in each side of an if-then-else diamond. 
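    // A load below is counted as strided when its pointer is not
    // loop-invariant and SCEV models its address as an affine add-recurrence.
    // Counting stops early because once more than MaxStridedLoads/2 such loads
    // have been seen, the MaxCount computed by the caller collapses to 1
    // regardless of the exact total.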
2258 for (const auto BB : L->blocks()) { 2259 for (auto &I : *BB) { 2260 LoadInst *LMemI = dyn_cast<LoadInst>(&I); 2261 if (!LMemI) 2262 continue; 2263 2264 Value *PtrValue = LMemI->getPointerOperand(); 2265 if (L->isLoopInvariant(PtrValue)) 2266 continue; 2267 2268 const SCEV *LSCEV = SE.getSCEV(PtrValue); 2269 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); 2270 if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) 2271 continue; 2272 2273 // FIXME? We could take pairing of unrolled load copies into account 2274 // by looking at the AddRec, but we would probably have to limit this 2275 // to loops with no stores or other memory optimization barriers. 2276 ++StridedLoads; 2277 // We've seen enough strided loads that seeing more won't make a 2278 // difference. 2279 if (StridedLoads > MaxStridedLoads / 2) 2280 return StridedLoads; 2281 } 2282 } 2283 return StridedLoads; 2284 }; 2285 2286 int StridedLoads = countStridedLoads(L, SE); 2287 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads 2288 << " strided loads\n"); 2289 // Pick the largest power of 2 unroll count that won't result in too many 2290 // strided loads. 2291 if (StridedLoads) { 2292 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); 2293 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " 2294 << UP.MaxCount << '\n'); 2295 } 2296 } 2297 2298 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2299 TTI::UnrollingPreferences &UP, 2300 OptimizationRemarkEmitter *ORE) { 2301 // Enable partial unrolling and runtime unrolling. 2302 BaseT::getUnrollingPreferences(L, SE, UP, ORE); 2303 2304 UP.UpperBound = true; 2305 2306 // For inner loop, it is more likely to be a hot one, and the runtime check 2307 // can be promoted out from LICM pass, so the overhead is less, let's try 2308 // a larger threshold to unroll more loops. 2309 if (L->getLoopDepth() > 1) 2310 UP.PartialThreshold *= 2; 2311 2312 // Disable partial & runtime unrolling on -Os. 2313 UP.PartialOptSizeThreshold = 0; 2314 2315 if (ST->getProcFamily() == AArch64Subtarget::Falkor && 2316 EnableFalkorHWPFUnrollFix) 2317 getFalkorUnrollingPreferences(L, SE, UP); 2318 2319 // Scan the loop: don't unroll loops with calls as this could prevent 2320 // inlining. Don't unroll vector loops either, as they don't benefit much from 2321 // unrolling. 2322 for (auto *BB : L->getBlocks()) { 2323 for (auto &I : *BB) { 2324 // Don't unroll vectorised loop. 
2325 if (I.getType()->isVectorTy()) 2326 return; 2327 2328 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 2329 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 2330 if (!isLoweredToCall(F)) 2331 continue; 2332 } 2333 return; 2334 } 2335 } 2336 } 2337 2338 // Enable runtime unrolling for in-order models 2339 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by 2340 // checking for that case, we can ensure that the default behaviour is 2341 // unchanged 2342 if (ST->getProcFamily() != AArch64Subtarget::Others && 2343 !ST->getSchedModel().isOutOfOrder()) { 2344 UP.Runtime = true; 2345 UP.Partial = true; 2346 UP.UnrollRemainder = true; 2347 UP.DefaultUnrollRuntimeCount = 4; 2348 2349 UP.UnrollAndJam = true; 2350 UP.UnrollAndJamInnerLoopThreshold = 60; 2351 } 2352 } 2353 2354 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 2355 TTI::PeelingPreferences &PP) { 2356 BaseT::getPeelingPreferences(L, SE, PP); 2357 } 2358 2359 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, 2360 Type *ExpectedType) { 2361 switch (Inst->getIntrinsicID()) { 2362 default: 2363 return nullptr; 2364 case Intrinsic::aarch64_neon_st2: 2365 case Intrinsic::aarch64_neon_st3: 2366 case Intrinsic::aarch64_neon_st4: { 2367 // Create a struct type 2368 StructType *ST = dyn_cast<StructType>(ExpectedType); 2369 if (!ST) 2370 return nullptr; 2371 unsigned NumElts = Inst->arg_size() - 1; 2372 if (ST->getNumElements() != NumElts) 2373 return nullptr; 2374 for (unsigned i = 0, e = NumElts; i != e; ++i) { 2375 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) 2376 return nullptr; 2377 } 2378 Value *Res = UndefValue::get(ExpectedType); 2379 IRBuilder<> Builder(Inst); 2380 for (unsigned i = 0, e = NumElts; i != e; ++i) { 2381 Value *L = Inst->getArgOperand(i); 2382 Res = Builder.CreateInsertValue(Res, L, i); 2383 } 2384 return Res; 2385 } 2386 case Intrinsic::aarch64_neon_ld2: 2387 case Intrinsic::aarch64_neon_ld3: 2388 case Intrinsic::aarch64_neon_ld4: 2389 if (Inst->getType() == ExpectedType) 2390 return Inst; 2391 return nullptr; 2392 } 2393 } 2394 2395 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, 2396 MemIntrinsicInfo &Info) { 2397 switch (Inst->getIntrinsicID()) { 2398 default: 2399 break; 2400 case Intrinsic::aarch64_neon_ld2: 2401 case Intrinsic::aarch64_neon_ld3: 2402 case Intrinsic::aarch64_neon_ld4: 2403 Info.ReadMem = true; 2404 Info.WriteMem = false; 2405 Info.PtrVal = Inst->getArgOperand(0); 2406 break; 2407 case Intrinsic::aarch64_neon_st2: 2408 case Intrinsic::aarch64_neon_st3: 2409 case Intrinsic::aarch64_neon_st4: 2410 Info.ReadMem = false; 2411 Info.WriteMem = true; 2412 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1); 2413 break; 2414 } 2415 2416 switch (Inst->getIntrinsicID()) { 2417 default: 2418 return false; 2419 case Intrinsic::aarch64_neon_ld2: 2420 case Intrinsic::aarch64_neon_st2: 2421 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; 2422 break; 2423 case Intrinsic::aarch64_neon_ld3: 2424 case Intrinsic::aarch64_neon_st3: 2425 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; 2426 break; 2427 case Intrinsic::aarch64_neon_ld4: 2428 case Intrinsic::aarch64_neon_st4: 2429 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; 2430 break; 2431 } 2432 return true; 2433 } 2434 2435 /// See if \p I should be considered for address type promotion. We check if \p 2436 /// I is a sext with right type and used in memory accesses. 
If it used in a 2437 /// "complex" getelementptr, we allow it to be promoted without finding other 2438 /// sext instructions that sign extended the same initial value. A getelementptr 2439 /// is considered as "complex" if it has more than 2 operands. 2440 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( 2441 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { 2442 bool Considerable = false; 2443 AllowPromotionWithoutCommonHeader = false; 2444 if (!isa<SExtInst>(&I)) 2445 return false; 2446 Type *ConsideredSExtType = 2447 Type::getInt64Ty(I.getParent()->getParent()->getContext()); 2448 if (I.getType() != ConsideredSExtType) 2449 return false; 2450 // See if the sext is the one with the right type and used in at least one 2451 // GetElementPtrInst. 2452 for (const User *U : I.users()) { 2453 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { 2454 Considerable = true; 2455 // A getelementptr is considered as "complex" if it has more than 2 2456 // operands. We will promote a SExt used in such complex GEP as we 2457 // expect some computation to be merged if they are done on 64 bits. 2458 if (GEPInst->getNumOperands() > 2) { 2459 AllowPromotionWithoutCommonHeader = true; 2460 break; 2461 } 2462 } 2463 } 2464 return Considerable; 2465 } 2466 2467 bool AArch64TTIImpl::isLegalToVectorizeReduction( 2468 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { 2469 if (!VF.isScalable()) 2470 return true; 2471 2472 Type *Ty = RdxDesc.getRecurrenceType(); 2473 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) 2474 return false; 2475 2476 switch (RdxDesc.getRecurrenceKind()) { 2477 case RecurKind::Add: 2478 case RecurKind::FAdd: 2479 case RecurKind::And: 2480 case RecurKind::Or: 2481 case RecurKind::Xor: 2482 case RecurKind::SMin: 2483 case RecurKind::SMax: 2484 case RecurKind::UMin: 2485 case RecurKind::UMax: 2486 case RecurKind::FMin: 2487 case RecurKind::FMax: 2488 case RecurKind::SelectICmp: 2489 case RecurKind::SelectFCmp: 2490 case RecurKind::FMulAdd: 2491 return true; 2492 default: 2493 return false; 2494 } 2495 } 2496 2497 InstructionCost 2498 AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, 2499 bool IsUnsigned, 2500 TTI::TargetCostKind CostKind) { 2501 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 2502 2503 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) 2504 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); 2505 2506 assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) && 2507 "Both vector needs to be equally scalable"); 2508 2509 InstructionCost LegalizationCost = 0; 2510 if (LT.first > 1) { 2511 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); 2512 unsigned MinMaxOpcode = 2513 Ty->isFPOrFPVectorTy() 2514 ? Intrinsic::maxnum 2515 : (IsUnsigned ? 
Intrinsic::umin : Intrinsic::smin); 2516 IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy}); 2517 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1); 2518 } 2519 2520 return LegalizationCost + /*Cost of horizontal reduction*/ 2; 2521 } 2522 2523 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( 2524 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { 2525 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 2526 InstructionCost LegalizationCost = 0; 2527 if (LT.first > 1) { 2528 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); 2529 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); 2530 LegalizationCost *= LT.first - 1; 2531 } 2532 2533 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2534 assert(ISD && "Invalid opcode"); 2535 // Add the final reduction cost for the legal horizontal reduction 2536 switch (ISD) { 2537 case ISD::ADD: 2538 case ISD::AND: 2539 case ISD::OR: 2540 case ISD::XOR: 2541 case ISD::FADD: 2542 return LegalizationCost + 2; 2543 default: 2544 return InstructionCost::getInvalid(); 2545 } 2546 } 2547 2548 InstructionCost 2549 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, 2550 Optional<FastMathFlags> FMF, 2551 TTI::TargetCostKind CostKind) { 2552 if (TTI::requiresOrderedReduction(FMF)) { 2553 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) { 2554 InstructionCost BaseCost = 2555 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 2556 // Add on extra cost to reflect the extra overhead on some CPUs. We still 2557 // end up vectorizing for more computationally intensive loops. 2558 return BaseCost + FixedVTy->getNumElements(); 2559 } 2560 2561 if (Opcode != Instruction::FAdd) 2562 return InstructionCost::getInvalid(); 2563 2564 auto *VTy = cast<ScalableVectorType>(ValTy); 2565 InstructionCost Cost = 2566 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); 2567 Cost *= getMaxNumElements(VTy->getElementCount()); 2568 return Cost; 2569 } 2570 2571 if (isa<ScalableVectorType>(ValTy)) 2572 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); 2573 2574 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 2575 MVT MTy = LT.second; 2576 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2577 assert(ISD && "Invalid opcode"); 2578 2579 // Horizontal adds can use the 'addv' instruction. We model the cost of these 2580 // instructions as twice a normal vector add, plus 1 for each legalization 2581 // step (LT.first). This is the only arithmetic vector reduction operation for 2582 // which we have an instruction. 
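  // For example, a v16i8 add reduction is legal in a single step
  // (LT.first == 1), so with the ADD/v16i8 table entry below it costs
  // (1 - 1) + 2 = 2.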
2583 // OR, XOR and AND costs should match the codegen from: 2584 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll 2585 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll 2586 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll 2587 static const CostTblEntry CostTblNoPairwise[]{ 2588 {ISD::ADD, MVT::v8i8, 2}, 2589 {ISD::ADD, MVT::v16i8, 2}, 2590 {ISD::ADD, MVT::v4i16, 2}, 2591 {ISD::ADD, MVT::v8i16, 2}, 2592 {ISD::ADD, MVT::v4i32, 2}, 2593 {ISD::OR, MVT::v8i8, 15}, 2594 {ISD::OR, MVT::v16i8, 17}, 2595 {ISD::OR, MVT::v4i16, 7}, 2596 {ISD::OR, MVT::v8i16, 9}, 2597 {ISD::OR, MVT::v2i32, 3}, 2598 {ISD::OR, MVT::v4i32, 5}, 2599 {ISD::OR, MVT::v2i64, 3}, 2600 {ISD::XOR, MVT::v8i8, 15}, 2601 {ISD::XOR, MVT::v16i8, 17}, 2602 {ISD::XOR, MVT::v4i16, 7}, 2603 {ISD::XOR, MVT::v8i16, 9}, 2604 {ISD::XOR, MVT::v2i32, 3}, 2605 {ISD::XOR, MVT::v4i32, 5}, 2606 {ISD::XOR, MVT::v2i64, 3}, 2607 {ISD::AND, MVT::v8i8, 15}, 2608 {ISD::AND, MVT::v16i8, 17}, 2609 {ISD::AND, MVT::v4i16, 7}, 2610 {ISD::AND, MVT::v8i16, 9}, 2611 {ISD::AND, MVT::v2i32, 3}, 2612 {ISD::AND, MVT::v4i32, 5}, 2613 {ISD::AND, MVT::v2i64, 3}, 2614 }; 2615 switch (ISD) { 2616 default: 2617 break; 2618 case ISD::ADD: 2619 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) 2620 return (LT.first - 1) + Entry->Cost; 2621 break; 2622 case ISD::XOR: 2623 case ISD::AND: 2624 case ISD::OR: 2625 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy); 2626 if (!Entry) 2627 break; 2628 auto *ValVTy = cast<FixedVectorType>(ValTy); 2629 if (!ValVTy->getElementType()->isIntegerTy(1) && 2630 MTy.getVectorNumElements() <= ValVTy->getNumElements() && 2631 isPowerOf2_32(ValVTy->getNumElements())) { 2632 InstructionCost ExtraCost = 0; 2633 if (LT.first != 1) { 2634 // Type needs to be split, so there is an extra cost of LT.first - 1 2635 // arithmetic ops. 2636 auto *Ty = FixedVectorType::get(ValTy->getElementType(), 2637 MTy.getVectorNumElements()); 2638 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); 2639 ExtraCost *= LT.first - 1; 2640 } 2641 return Entry->Cost + ExtraCost; 2642 } 2643 break; 2644 } 2645 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 2646 } 2647 2648 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { 2649 static const CostTblEntry ShuffleTbl[] = { 2650 { TTI::SK_Splice, MVT::nxv16i8, 1 }, 2651 { TTI::SK_Splice, MVT::nxv8i16, 1 }, 2652 { TTI::SK_Splice, MVT::nxv4i32, 1 }, 2653 { TTI::SK_Splice, MVT::nxv2i64, 1 }, 2654 { TTI::SK_Splice, MVT::nxv2f16, 1 }, 2655 { TTI::SK_Splice, MVT::nxv4f16, 1 }, 2656 { TTI::SK_Splice, MVT::nxv8f16, 1 }, 2657 { TTI::SK_Splice, MVT::nxv2bf16, 1 }, 2658 { TTI::SK_Splice, MVT::nxv4bf16, 1 }, 2659 { TTI::SK_Splice, MVT::nxv8bf16, 1 }, 2660 { TTI::SK_Splice, MVT::nxv2f32, 1 }, 2661 { TTI::SK_Splice, MVT::nxv4f32, 1 }, 2662 { TTI::SK_Splice, MVT::nxv2f64, 1 }, 2663 }; 2664 2665 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 2666 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext()); 2667 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2668 EVT PromotedVT = LT.second.getScalarType() == MVT::i1 2669 ? 
TLI->getPromotedVTForPredicate(EVT(LT.second)) 2670 : LT.second; 2671 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext()); 2672 InstructionCost LegalizationCost = 0; 2673 if (Index < 0) { 2674 LegalizationCost = 2675 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy, 2676 CmpInst::BAD_ICMP_PREDICATE, CostKind) + 2677 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy, 2678 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2679 } 2680 2681 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp 2682 // Cost performed on a promoted type. 2683 if (LT.second.getScalarType() == MVT::i1) { 2684 LegalizationCost += 2685 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy, 2686 TTI::CastContextHint::None, CostKind) + 2687 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy, 2688 TTI::CastContextHint::None, CostKind); 2689 } 2690 const auto *Entry = 2691 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT()); 2692 assert(Entry && "Illegal Type for Splice"); 2693 LegalizationCost += Entry->Cost; 2694 return LegalizationCost * LT.first; 2695 } 2696 2697 InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, 2698 VectorType *Tp, 2699 ArrayRef<int> Mask, int Index, 2700 VectorType *SubTp, 2701 ArrayRef<const Value *> Args) { 2702 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 2703 // If we have a Mask, and the LT is being legalized somehow, split the Mask 2704 // into smaller vectors and sum the cost of each shuffle. 2705 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() && 2706 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && 2707 cast<FixedVectorType>(Tp)->getNumElements() > 2708 LT.second.getVectorNumElements() && 2709 !Index && !SubTp) { 2710 unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements(); 2711 assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!"); 2712 unsigned LTNumElts = LT.second.getVectorNumElements(); 2713 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; 2714 VectorType *NTp = 2715 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount()); 2716 InstructionCost Cost; 2717 for (unsigned N = 0; N < NumVecs; N++) { 2718 SmallVector<int> NMask; 2719 // Split the existing mask into chunks of size LTNumElts. Track the source 2720 // sub-vectors to ensure the result has at most 2 inputs. 2721 unsigned Source1, Source2; 2722 unsigned NumSources = 0; 2723 for (unsigned E = 0; E < LTNumElts; E++) { 2724 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E] 2725 : UndefMaskElem; 2726 if (MaskElt < 0) { 2727 NMask.push_back(UndefMaskElem); 2728 continue; 2729 } 2730 2731 // Calculate which source from the input this comes from and whether it 2732 // is new to us. 2733 unsigned Source = MaskElt / LTNumElts; 2734 if (NumSources == 0) { 2735 Source1 = Source; 2736 NumSources = 1; 2737 } else if (NumSources == 1 && Source != Source1) { 2738 Source2 = Source; 2739 NumSources = 2; 2740 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) { 2741 NumSources++; 2742 } 2743 2744 // Add to the new mask. For the NumSources>2 case these are not correct, 2745 // but are only used for the modular lane number. 
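        // For example, with LTNumElts == 4 a mask value of 6 refers to lane 2
        // of input sub-vector 1; if that sub-vector is the second distinct
        // source seen, the value becomes 2 + 4 == 6 in the two-input sub-mask.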
2746 if (Source == Source1) 2747 NMask.push_back(MaskElt % LTNumElts); 2748 else if (Source == Source2) 2749 NMask.push_back(MaskElt % LTNumElts + LTNumElts); 2750 else 2751 NMask.push_back(MaskElt % LTNumElts); 2752 } 2753 // If the sub-mask has at most 2 input sub-vectors then re-cost it using 2754 // getShuffleCost. If not then cost it using the worst case. 2755 if (NumSources <= 2) 2756 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc 2757 : TTI::SK_PermuteTwoSrc, 2758 NTp, NMask, 0, nullptr, Args); 2759 else if (any_of(enumerate(NMask), [&](const auto &ME) { 2760 return ME.value() % LTNumElts == ME.index(); 2761 })) 2762 Cost += LTNumElts - 1; 2763 else 2764 Cost += LTNumElts; 2765 } 2766 return Cost; 2767 } 2768 2769 Kind = improveShuffleKindFromMask(Kind, Mask); 2770 2771 // Check for broadcast loads. 2772 if (Kind == TTI::SK_Broadcast) { 2773 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]); 2774 if (IsLoad && LT.second.isVector() && 2775 isLegalBroadcastLoad(Tp->getElementType(), 2776 LT.second.getVectorElementCount())) 2777 return 0; // broadcast is handled by ld1r 2778 } 2779 2780 // If we have 4 elements for the shuffle and a Mask, get the cost straight 2781 // from the perfect shuffle tables. 2782 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) && 2783 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) && 2784 all_of(Mask, [](int E) { return E < 8; })) 2785 return getPerfectShuffleCost(Mask); 2786 2787 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || 2788 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || 2789 Kind == TTI::SK_Reverse) { 2790 2791 static const CostTblEntry ShuffleTbl[] = { 2792 // Broadcast shuffle kinds can be performed with 'dup'. 2793 { TTI::SK_Broadcast, MVT::v8i8, 1 }, 2794 { TTI::SK_Broadcast, MVT::v16i8, 1 }, 2795 { TTI::SK_Broadcast, MVT::v4i16, 1 }, 2796 { TTI::SK_Broadcast, MVT::v8i16, 1 }, 2797 { TTI::SK_Broadcast, MVT::v2i32, 1 }, 2798 { TTI::SK_Broadcast, MVT::v4i32, 1 }, 2799 { TTI::SK_Broadcast, MVT::v2i64, 1 }, 2800 { TTI::SK_Broadcast, MVT::v2f32, 1 }, 2801 { TTI::SK_Broadcast, MVT::v4f32, 1 }, 2802 { TTI::SK_Broadcast, MVT::v2f64, 1 }, 2803 // Transpose shuffle kinds can be performed with 'trn1/trn2' and 2804 // 'zip1/zip2' instructions. 2805 { TTI::SK_Transpose, MVT::v8i8, 1 }, 2806 { TTI::SK_Transpose, MVT::v16i8, 1 }, 2807 { TTI::SK_Transpose, MVT::v4i16, 1 }, 2808 { TTI::SK_Transpose, MVT::v8i16, 1 }, 2809 { TTI::SK_Transpose, MVT::v2i32, 1 }, 2810 { TTI::SK_Transpose, MVT::v4i32, 1 }, 2811 { TTI::SK_Transpose, MVT::v2i64, 1 }, 2812 { TTI::SK_Transpose, MVT::v2f32, 1 }, 2813 { TTI::SK_Transpose, MVT::v4f32, 1 }, 2814 { TTI::SK_Transpose, MVT::v2f64, 1 }, 2815 // Select shuffle kinds. 2816 // TODO: handle vXi8/vXi16. 2817 { TTI::SK_Select, MVT::v2i32, 1 }, // mov. 2818 { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar). 2819 { TTI::SK_Select, MVT::v2i64, 1 }, // mov. 2820 { TTI::SK_Select, MVT::v2f32, 1 }, // mov. 2821 { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar). 2822 { TTI::SK_Select, MVT::v2f64, 1 }, // mov. 2823 // PermuteSingleSrc shuffle kinds. 2824 { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov. 2825 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case. 2826 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov. 2827 { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov. 2828 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case. 
2829 { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov. 2830 { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case. 2831 { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case. 2832 { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case. 2833 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl 2834 { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl 2835 { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl 2836 { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl 2837 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl 2838 // Reverse can be lowered with `rev`. 2839 { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov. 2840 { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT 2841 { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov. 2842 { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov. 2843 { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT 2844 { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov. 2845 { TTI::SK_Reverse, MVT::v8f16, 2 }, // REV64; EXT 2846 { TTI::SK_Reverse, MVT::v8i16, 2 }, // REV64; EXT 2847 { TTI::SK_Reverse, MVT::v16i8, 2 }, // REV64; EXT 2848 { TTI::SK_Reverse, MVT::v4f16, 1 }, // REV64 2849 { TTI::SK_Reverse, MVT::v4i16, 1 }, // REV64 2850 { TTI::SK_Reverse, MVT::v8i8, 1 }, // REV64 2851 // Broadcast shuffle kinds for scalable vectors 2852 { TTI::SK_Broadcast, MVT::nxv16i8, 1 }, 2853 { TTI::SK_Broadcast, MVT::nxv8i16, 1 }, 2854 { TTI::SK_Broadcast, MVT::nxv4i32, 1 }, 2855 { TTI::SK_Broadcast, MVT::nxv2i64, 1 }, 2856 { TTI::SK_Broadcast, MVT::nxv2f16, 1 }, 2857 { TTI::SK_Broadcast, MVT::nxv4f16, 1 }, 2858 { TTI::SK_Broadcast, MVT::nxv8f16, 1 }, 2859 { TTI::SK_Broadcast, MVT::nxv2bf16, 1 }, 2860 { TTI::SK_Broadcast, MVT::nxv4bf16, 1 }, 2861 { TTI::SK_Broadcast, MVT::nxv8bf16, 1 }, 2862 { TTI::SK_Broadcast, MVT::nxv2f32, 1 }, 2863 { TTI::SK_Broadcast, MVT::nxv4f32, 1 }, 2864 { TTI::SK_Broadcast, MVT::nxv2f64, 1 }, 2865 { TTI::SK_Broadcast, MVT::nxv16i1, 1 }, 2866 { TTI::SK_Broadcast, MVT::nxv8i1, 1 }, 2867 { TTI::SK_Broadcast, MVT::nxv4i1, 1 }, 2868 { TTI::SK_Broadcast, MVT::nxv2i1, 1 }, 2869 // Handle the cases for vector.reverse with scalable vectors 2870 { TTI::SK_Reverse, MVT::nxv16i8, 1 }, 2871 { TTI::SK_Reverse, MVT::nxv8i16, 1 }, 2872 { TTI::SK_Reverse, MVT::nxv4i32, 1 }, 2873 { TTI::SK_Reverse, MVT::nxv2i64, 1 }, 2874 { TTI::SK_Reverse, MVT::nxv2f16, 1 }, 2875 { TTI::SK_Reverse, MVT::nxv4f16, 1 }, 2876 { TTI::SK_Reverse, MVT::nxv8f16, 1 }, 2877 { TTI::SK_Reverse, MVT::nxv2bf16, 1 }, 2878 { TTI::SK_Reverse, MVT::nxv4bf16, 1 }, 2879 { TTI::SK_Reverse, MVT::nxv8bf16, 1 }, 2880 { TTI::SK_Reverse, MVT::nxv2f32, 1 }, 2881 { TTI::SK_Reverse, MVT::nxv4f32, 1 }, 2882 { TTI::SK_Reverse, MVT::nxv2f64, 1 }, 2883 { TTI::SK_Reverse, MVT::nxv16i1, 1 }, 2884 { TTI::SK_Reverse, MVT::nxv8i1, 1 }, 2885 { TTI::SK_Reverse, MVT::nxv4i1, 1 }, 2886 { TTI::SK_Reverse, MVT::nxv2i1, 1 }, 2887 }; 2888 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) 2889 return LT.first * Entry->Cost; 2890 } 2891 2892 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp)) 2893 return getSpliceCost(Tp, Index); 2894 2895 // Inserting a subvector can often be done with either a D, S or H register 2896 // move, so long as the inserted vector is "aligned". 
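  // For example, inserting a v2f32 sub-vector into a v4f32 at index 0 or 2
  // lines up with a 64-bit lane, so the cost below is just the sub-vector's
  // legalization count (SubLT.first).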
2897 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() && 2898 LT.second.getSizeInBits() <= 128 && SubTp) { 2899 std::pair<InstructionCost, MVT> SubLT = 2900 TLI->getTypeLegalizationCost(DL, SubTp); 2901 if (SubLT.second.isVector()) { 2902 int NumElts = LT.second.getVectorNumElements(); 2903 int NumSubElts = SubLT.second.getVectorNumElements(); 2904 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) 2905 return SubLT.first; 2906 } 2907 } 2908 2909 return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); 2910 } 2911
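// Illustrative sketch only (not part of this file's build): how a pass might
// reach these hooks through the generic TargetTransformInfo wrapper. The
// analysis retrieval follows the usual new-pass-manager pattern; the FAM, F,
// and Ctx names are assumed here purely for illustration.
//
//   const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
//   InstructionCost C = TTI.getArithmeticInstrCost(
//       Instruction::Mul, FixedVectorType::get(Type::getInt64Ty(Ctx), 2),
//       TargetTransformInfo::TCK_RecipThroughput);
//
// On AArch64 this routes to AArch64TTIImpl::getArithmeticInstrCost above,
// which returns 14 for a <2 x i64> multiply unless the operands are extends
// that allow a smull/umull to be used instead.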