1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "AArch64TargetTransformInfo.h" 10 #include "AArch64ExpandImm.h" 11 #include "AArch64PerfectShuffle.h" 12 #include "MCTargetDesc/AArch64AddressingModes.h" 13 #include "llvm/Analysis/IVDescriptors.h" 14 #include "llvm/Analysis/LoopInfo.h" 15 #include "llvm/Analysis/TargetTransformInfo.h" 16 #include "llvm/CodeGen/BasicTTIImpl.h" 17 #include "llvm/CodeGen/CostTable.h" 18 #include "llvm/CodeGen/TargetLowering.h" 19 #include "llvm/IR/IntrinsicInst.h" 20 #include "llvm/IR/Intrinsics.h" 21 #include "llvm/IR/IntrinsicsAArch64.h" 22 #include "llvm/IR/PatternMatch.h" 23 #include "llvm/Support/Debug.h" 24 #include "llvm/Transforms/InstCombine/InstCombiner.h" 25 #include <algorithm> 26 using namespace llvm; 27 using namespace llvm::PatternMatch; 28 29 #define DEBUG_TYPE "aarch64tti" 30 31 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", 32 cl::init(true), cl::Hidden); 33 34 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10), 35 cl::Hidden); 36 37 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", 38 cl::init(10), cl::Hidden); 39 40 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, 41 const Function *Callee) const { 42 const TargetMachine &TM = getTLI()->getTargetMachine(); 43 44 const FeatureBitset &CallerBits = 45 TM.getSubtargetImpl(*Caller)->getFeatureBits(); 46 const FeatureBitset &CalleeBits = 47 TM.getSubtargetImpl(*Callee)->getFeatureBits(); 48 49 // Inline a callee if its target-features are a subset of the callers 50 // target-features. 51 return (CallerBits & CalleeBits) == CalleeBits; 52 } 53 54 bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( 55 TargetTransformInfo::RegisterKind K) const { 56 assert(K != TargetTransformInfo::RGK_Scalar); 57 return K == TargetTransformInfo::RGK_FixedWidthVector; 58 } 59 60 /// Calculate the cost of materializing a 64-bit value. This helper 61 /// method might only calculate a fraction of a larger immediate. Therefore it 62 /// is valid to return a cost of ZERO. 63 InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) { 64 // Check if the immediate can be encoded within an instruction. 65 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) 66 return 0; 67 68 if (Val < 0) 69 Val = ~Val; 70 71 // Calculate how many moves we will need to materialize this constant. 72 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 73 AArch64_IMM::expandMOVImm(Val, 64, Insn); 74 return Insn.size(); 75 } 76 77 /// Calculate the cost of materializing the given constant. 78 InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 79 TTI::TargetCostKind CostKind) { 80 assert(Ty->isIntegerTy()); 81 82 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 83 if (BitSize == 0) 84 return ~0U; 85 86 // Sign-extend all constants to a multiple of 64-bit. 87 APInt ImmVal = Imm; 88 if (BitSize & 0x3f) 89 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); 90 91 // Split the constant into 64-bit chunks and calculate the cost for each 92 // chunk. 93 InstructionCost Cost = 0; 94 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { 95 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); 96 int64_t Val = Tmp.getSExtValue(); 97 Cost += getIntImmCost(Val); 98 } 99 // We need at least one instruction to materialze the constant. 100 return std::max<InstructionCost>(1, Cost); 101 } 102 103 InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 104 const APInt &Imm, Type *Ty, 105 TTI::TargetCostKind CostKind, 106 Instruction *Inst) { 107 assert(Ty->isIntegerTy()); 108 109 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 110 // There is no cost model for constants with a bit size of 0. Return TCC_Free 111 // here, so that constant hoisting will ignore this constant. 112 if (BitSize == 0) 113 return TTI::TCC_Free; 114 115 unsigned ImmIdx = ~0U; 116 switch (Opcode) { 117 default: 118 return TTI::TCC_Free; 119 case Instruction::GetElementPtr: 120 // Always hoist the base address of a GetElementPtr. 121 if (Idx == 0) 122 return 2 * TTI::TCC_Basic; 123 return TTI::TCC_Free; 124 case Instruction::Store: 125 ImmIdx = 0; 126 break; 127 case Instruction::Add: 128 case Instruction::Sub: 129 case Instruction::Mul: 130 case Instruction::UDiv: 131 case Instruction::SDiv: 132 case Instruction::URem: 133 case Instruction::SRem: 134 case Instruction::And: 135 case Instruction::Or: 136 case Instruction::Xor: 137 case Instruction::ICmp: 138 ImmIdx = 1; 139 break; 140 // Always return TCC_Free for the shift value of a shift instruction. 141 case Instruction::Shl: 142 case Instruction::LShr: 143 case Instruction::AShr: 144 if (Idx == 1) 145 return TTI::TCC_Free; 146 break; 147 case Instruction::Trunc: 148 case Instruction::ZExt: 149 case Instruction::SExt: 150 case Instruction::IntToPtr: 151 case Instruction::PtrToInt: 152 case Instruction::BitCast: 153 case Instruction::PHI: 154 case Instruction::Call: 155 case Instruction::Select: 156 case Instruction::Ret: 157 case Instruction::Load: 158 break; 159 } 160 161 if (Idx == ImmIdx) { 162 int NumConstants = (BitSize + 63) / 64; 163 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 164 return (Cost <= NumConstants * TTI::TCC_Basic) 165 ? static_cast<int>(TTI::TCC_Free) 166 : Cost; 167 } 168 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 169 } 170 171 InstructionCost 172 AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, 173 const APInt &Imm, Type *Ty, 174 TTI::TargetCostKind CostKind) { 175 assert(Ty->isIntegerTy()); 176 177 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 178 // There is no cost model for constants with a bit size of 0. Return TCC_Free 179 // here, so that constant hoisting will ignore this constant. 180 if (BitSize == 0) 181 return TTI::TCC_Free; 182 183 // Most (all?) AArch64 intrinsics do not support folding immediates into the 184 // selected instruction, so we compute the materialization cost for the 185 // immediate directly. 186 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) 187 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 188 189 switch (IID) { 190 default: 191 return TTI::TCC_Free; 192 case Intrinsic::sadd_with_overflow: 193 case Intrinsic::uadd_with_overflow: 194 case Intrinsic::ssub_with_overflow: 195 case Intrinsic::usub_with_overflow: 196 case Intrinsic::smul_with_overflow: 197 case Intrinsic::umul_with_overflow: 198 if (Idx == 1) { 199 int NumConstants = (BitSize + 63) / 64; 200 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 201 return (Cost <= NumConstants * TTI::TCC_Basic) 202 ? static_cast<int>(TTI::TCC_Free) 203 : Cost; 204 } 205 break; 206 case Intrinsic::experimental_stackmap: 207 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 208 return TTI::TCC_Free; 209 break; 210 case Intrinsic::experimental_patchpoint_void: 211 case Intrinsic::experimental_patchpoint_i64: 212 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 213 return TTI::TCC_Free; 214 break; 215 case Intrinsic::experimental_gc_statepoint: 216 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 217 return TTI::TCC_Free; 218 break; 219 } 220 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 221 } 222 223 TargetTransformInfo::PopcntSupportKind 224 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { 225 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 226 if (TyWidth == 32 || TyWidth == 64) 227 return TTI::PSK_FastHardware; 228 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. 229 return TTI::PSK_Software; 230 } 231 232 InstructionCost 233 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 234 TTI::TargetCostKind CostKind) { 235 auto *RetTy = ICA.getReturnType(); 236 switch (ICA.getID()) { 237 case Intrinsic::umin: 238 case Intrinsic::umax: 239 case Intrinsic::smin: 240 case Intrinsic::smax: { 241 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, 242 MVT::v8i16, MVT::v2i32, MVT::v4i32}; 243 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 244 // v2i64 types get converted to cmp+bif hence the cost of 2 245 if (LT.second == MVT::v2i64) 246 return LT.first * 2; 247 if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; })) 248 return LT.first; 249 break; 250 } 251 case Intrinsic::sadd_sat: 252 case Intrinsic::ssub_sat: 253 case Intrinsic::uadd_sat: 254 case Intrinsic::usub_sat: { 255 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, 256 MVT::v8i16, MVT::v2i32, MVT::v4i32, 257 MVT::v2i64}; 258 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 259 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we 260 // need to extend the type, as it uses shr(qadd(shl, shl)). 261 unsigned Instrs = 262 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4; 263 if (any_of(ValidSatTys, [<](MVT M) { return M == LT.second; })) 264 return LT.first * Instrs; 265 break; 266 } 267 case Intrinsic::abs: { 268 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, 269 MVT::v8i16, MVT::v2i32, MVT::v4i32, 270 MVT::v2i64}; 271 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 272 if (any_of(ValidAbsTys, [<](MVT M) { return M == LT.second; })) 273 return LT.first; 274 break; 275 } 276 case Intrinsic::experimental_stepvector: { 277 InstructionCost Cost = 1; // Cost of the `index' instruction 278 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 279 // Legalisation of illegal vectors involves an `index' instruction plus 280 // (LT.first - 1) vector adds. 281 if (LT.first > 1) { 282 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext()); 283 InstructionCost AddCost = 284 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind); 285 Cost += AddCost * (LT.first - 1); 286 } 287 return Cost; 288 } 289 case Intrinsic::bitreverse: { 290 static const CostTblEntry BitreverseTbl[] = { 291 {Intrinsic::bitreverse, MVT::i32, 1}, 292 {Intrinsic::bitreverse, MVT::i64, 1}, 293 {Intrinsic::bitreverse, MVT::v8i8, 1}, 294 {Intrinsic::bitreverse, MVT::v16i8, 1}, 295 {Intrinsic::bitreverse, MVT::v4i16, 2}, 296 {Intrinsic::bitreverse, MVT::v8i16, 2}, 297 {Intrinsic::bitreverse, MVT::v2i32, 2}, 298 {Intrinsic::bitreverse, MVT::v4i32, 2}, 299 {Intrinsic::bitreverse, MVT::v1i64, 2}, 300 {Intrinsic::bitreverse, MVT::v2i64, 2}, 301 }; 302 const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy); 303 const auto *Entry = 304 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second); 305 if (Entry) { 306 // Cost Model is using the legal type(i32) that i8 and i16 will be 307 // converted to +1 so that we match the actual lowering cost 308 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 || 309 TLI->getValueType(DL, RetTy, true) == MVT::i16) 310 return LegalisationCost.first * Entry->Cost + 1; 311 312 return LegalisationCost.first * Entry->Cost; 313 } 314 break; 315 } 316 case Intrinsic::ctpop: { 317 static const CostTblEntry CtpopCostTbl[] = { 318 {ISD::CTPOP, MVT::v2i64, 4}, 319 {ISD::CTPOP, MVT::v4i32, 3}, 320 {ISD::CTPOP, MVT::v8i16, 2}, 321 {ISD::CTPOP, MVT::v16i8, 1}, 322 {ISD::CTPOP, MVT::i64, 4}, 323 {ISD::CTPOP, MVT::v2i32, 3}, 324 {ISD::CTPOP, MVT::v4i16, 2}, 325 {ISD::CTPOP, MVT::v8i8, 1}, 326 {ISD::CTPOP, MVT::i32, 5}, 327 }; 328 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 329 MVT MTy = LT.second; 330 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) { 331 // Extra cost of +1 when illegal vector types are legalized by promoting 332 // the integer type. 333 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() != 334 RetTy->getScalarSizeInBits() 335 ? 1 336 : 0; 337 return LT.first * Entry->Cost + ExtraCost; 338 } 339 break; 340 } 341 case Intrinsic::sadd_with_overflow: 342 case Intrinsic::uadd_with_overflow: 343 case Intrinsic::ssub_with_overflow: 344 case Intrinsic::usub_with_overflow: 345 case Intrinsic::smul_with_overflow: 346 case Intrinsic::umul_with_overflow: { 347 static const CostTblEntry WithOverflowCostTbl[] = { 348 {Intrinsic::sadd_with_overflow, MVT::i8, 3}, 349 {Intrinsic::uadd_with_overflow, MVT::i8, 3}, 350 {Intrinsic::sadd_with_overflow, MVT::i16, 3}, 351 {Intrinsic::uadd_with_overflow, MVT::i16, 3}, 352 {Intrinsic::sadd_with_overflow, MVT::i32, 1}, 353 {Intrinsic::uadd_with_overflow, MVT::i32, 1}, 354 {Intrinsic::sadd_with_overflow, MVT::i64, 1}, 355 {Intrinsic::uadd_with_overflow, MVT::i64, 1}, 356 {Intrinsic::ssub_with_overflow, MVT::i8, 3}, 357 {Intrinsic::usub_with_overflow, MVT::i8, 3}, 358 {Intrinsic::ssub_with_overflow, MVT::i16, 3}, 359 {Intrinsic::usub_with_overflow, MVT::i16, 3}, 360 {Intrinsic::ssub_with_overflow, MVT::i32, 1}, 361 {Intrinsic::usub_with_overflow, MVT::i32, 1}, 362 {Intrinsic::ssub_with_overflow, MVT::i64, 1}, 363 {Intrinsic::usub_with_overflow, MVT::i64, 1}, 364 {Intrinsic::smul_with_overflow, MVT::i8, 5}, 365 {Intrinsic::umul_with_overflow, MVT::i8, 4}, 366 {Intrinsic::smul_with_overflow, MVT::i16, 5}, 367 {Intrinsic::umul_with_overflow, MVT::i16, 4}, 368 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst 369 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw 370 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp 371 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr 372 }; 373 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true); 374 if (MTy.isSimple()) 375 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(), 376 MTy.getSimpleVT())) 377 return Entry->Cost; 378 break; 379 } 380 case Intrinsic::fptosi_sat: 381 case Intrinsic::fptoui_sat: { 382 if (ICA.getArgTypes().empty()) 383 break; 384 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; 385 auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]); 386 EVT MTy = TLI->getValueType(DL, RetTy); 387 // Check for the legal types, which are where the size of the input and the 388 // output are the same, or we are using cvt f64->i32 or f32->i64. 389 if ((LT.second == MVT::f32 || LT.second == MVT::f64 || 390 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || 391 LT.second == MVT::v2f64) && 392 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || 393 (LT.second == MVT::f64 && MTy == MVT::i32) || 394 (LT.second == MVT::f32 && MTy == MVT::i64))) 395 return LT.first; 396 // Similarly for fp16 sizes 397 if (ST->hasFullFP16() && 398 ((LT.second == MVT::f16 && MTy == MVT::i32) || 399 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && 400 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))) 401 return LT.first; 402 403 // Otherwise we use a legal convert followed by a min+max 404 if ((LT.second.getScalarType() == MVT::f32 || 405 LT.second.getScalarType() == MVT::f64 || 406 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) && 407 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { 408 Type *LegalTy = 409 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits()); 410 if (LT.second.isVector()) 411 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount()); 412 InstructionCost Cost = 1; 413 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin, 414 LegalTy, {LegalTy, LegalTy}); 415 Cost += getIntrinsicInstrCost(Attrs1, CostKind); 416 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax, 417 LegalTy, {LegalTy, LegalTy}); 418 Cost += getIntrinsicInstrCost(Attrs2, CostKind); 419 return LT.first * Cost; 420 } 421 break; 422 } 423 case Intrinsic::fshl: 424 case Intrinsic::fshr: 425 // FIXME: Match legacy behavior; this is probably not the right costing. 426 if (isa<ScalableVectorType>(RetTy)) 427 return 1; 428 break; 429 default: 430 break; 431 } 432 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 433 } 434 435 /// The function will remove redundant reinterprets casting in the presence 436 /// of the control flow 437 static Optional<Instruction *> processPhiNode(InstCombiner &IC, 438 IntrinsicInst &II) { 439 SmallVector<Instruction *, 32> Worklist; 440 auto RequiredType = II.getType(); 441 442 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0)); 443 assert(PN && "Expected Phi Node!"); 444 445 // Don't create a new Phi unless we can remove the old one. 446 if (!PN->hasOneUse()) 447 return None; 448 449 for (Value *IncValPhi : PN->incoming_values()) { 450 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi); 451 if (!Reinterpret || 452 Reinterpret->getIntrinsicID() != 453 Intrinsic::aarch64_sve_convert_to_svbool || 454 RequiredType != Reinterpret->getArgOperand(0)->getType()) 455 return None; 456 } 457 458 // Create the new Phi 459 LLVMContext &Ctx = PN->getContext(); 460 IRBuilder<> Builder(Ctx); 461 Builder.SetInsertPoint(PN); 462 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); 463 Worklist.push_back(PN); 464 465 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { 466 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); 467 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); 468 Worklist.push_back(Reinterpret); 469 } 470 471 // Cleanup Phi Node and reinterprets 472 return IC.replaceInstUsesWith(II, NPN); 473 } 474 475 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))) 476 // => (binop (pred) (from_svbool _) (from_svbool _)) 477 // 478 // The above transformation eliminates a `to_svbool` in the predicate 479 // operand of bitwise operation `binop` by narrowing the vector width of 480 // the operation. For example, it would convert a `<vscale x 16 x i1> 481 // and` into a `<vscale x 4 x i1> and`. This is profitable because 482 // to_svbool must zero the new lanes during widening, whereas 483 // from_svbool is free. 484 static Optional<Instruction *> tryCombineFromSVBoolBinOp(InstCombiner &IC, 485 IntrinsicInst &II) { 486 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0)); 487 if (!BinOp) 488 return None; 489 490 auto IntrinsicID = BinOp->getIntrinsicID(); 491 switch (IntrinsicID) { 492 case Intrinsic::aarch64_sve_and_z: 493 case Intrinsic::aarch64_sve_bic_z: 494 case Intrinsic::aarch64_sve_eor_z: 495 case Intrinsic::aarch64_sve_nand_z: 496 case Intrinsic::aarch64_sve_nor_z: 497 case Intrinsic::aarch64_sve_orn_z: 498 case Intrinsic::aarch64_sve_orr_z: 499 break; 500 default: 501 return None; 502 } 503 504 auto BinOpPred = BinOp->getOperand(0); 505 auto BinOpOp1 = BinOp->getOperand(1); 506 auto BinOpOp2 = BinOp->getOperand(2); 507 508 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred); 509 if (!PredIntr || 510 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) 511 return None; 512 513 auto PredOp = PredIntr->getOperand(0); 514 auto PredOpTy = cast<VectorType>(PredOp->getType()); 515 if (PredOpTy != II.getType()) 516 return None; 517 518 IRBuilder<> Builder(II.getContext()); 519 Builder.SetInsertPoint(&II); 520 521 SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; 522 auto NarrowBinOpOp1 = Builder.CreateIntrinsic( 523 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1}); 524 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 525 if (BinOpOp1 == BinOpOp2) 526 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 527 else 528 NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic( 529 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2})); 530 531 auto NarrowedBinOp = 532 Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs); 533 return IC.replaceInstUsesWith(II, NarrowedBinOp); 534 } 535 536 static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC, 537 IntrinsicInst &II) { 538 // If the reinterpret instruction operand is a PHI Node 539 if (isa<PHINode>(II.getArgOperand(0))) 540 return processPhiNode(IC, II); 541 542 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) 543 return BinOpCombine; 544 545 SmallVector<Instruction *, 32> CandidatesForRemoval; 546 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; 547 548 const auto *IVTy = cast<VectorType>(II.getType()); 549 550 // Walk the chain of conversions. 551 while (Cursor) { 552 // If the type of the cursor has fewer lanes than the final result, zeroing 553 // must take place, which breaks the equivalence chain. 554 const auto *CursorVTy = cast<VectorType>(Cursor->getType()); 555 if (CursorVTy->getElementCount().getKnownMinValue() < 556 IVTy->getElementCount().getKnownMinValue()) 557 break; 558 559 // If the cursor has the same type as I, it is a viable replacement. 560 if (Cursor->getType() == IVTy) 561 EarliestReplacement = Cursor; 562 563 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); 564 565 // If this is not an SVE conversion intrinsic, this is the end of the chain. 566 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == 567 Intrinsic::aarch64_sve_convert_to_svbool || 568 IntrinsicCursor->getIntrinsicID() == 569 Intrinsic::aarch64_sve_convert_from_svbool)) 570 break; 571 572 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); 573 Cursor = IntrinsicCursor->getOperand(0); 574 } 575 576 // If no viable replacement in the conversion chain was found, there is 577 // nothing to do. 578 if (!EarliestReplacement) 579 return None; 580 581 return IC.replaceInstUsesWith(II, EarliestReplacement); 582 } 583 584 static Optional<Instruction *> instCombineSVESel(InstCombiner &IC, 585 IntrinsicInst &II) { 586 IRBuilder<> Builder(&II); 587 auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1), 588 II.getOperand(2)); 589 return IC.replaceInstUsesWith(II, Select); 590 } 591 592 static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC, 593 IntrinsicInst &II) { 594 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 595 if (!Pg) 596 return None; 597 598 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 599 return None; 600 601 const auto PTruePattern = 602 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 603 if (PTruePattern != AArch64SVEPredPattern::vl1) 604 return None; 605 606 // The intrinsic is inserting into lane zero so use an insert instead. 607 auto *IdxTy = Type::getInt64Ty(II.getContext()); 608 auto *Insert = InsertElementInst::Create( 609 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0)); 610 Insert->insertBefore(&II); 611 Insert->takeName(&II); 612 613 return IC.replaceInstUsesWith(II, Insert); 614 } 615 616 static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, 617 IntrinsicInst &II) { 618 // Replace DupX with a regular IR splat. 619 IRBuilder<> Builder(II.getContext()); 620 Builder.SetInsertPoint(&II); 621 auto *RetTy = cast<ScalableVectorType>(II.getType()); 622 Value *Splat = 623 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0)); 624 Splat->takeName(&II); 625 return IC.replaceInstUsesWith(II, Splat); 626 } 627 628 static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, 629 IntrinsicInst &II) { 630 LLVMContext &Ctx = II.getContext(); 631 IRBuilder<> Builder(Ctx); 632 Builder.SetInsertPoint(&II); 633 634 // Check that the predicate is all active 635 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 636 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 637 return None; 638 639 const auto PTruePattern = 640 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 641 if (PTruePattern != AArch64SVEPredPattern::all) 642 return None; 643 644 // Check that we have a compare of zero.. 645 auto *SplatValue = 646 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2))); 647 if (!SplatValue || !SplatValue->isZero()) 648 return None; 649 650 // ..against a dupq 651 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 652 if (!DupQLane || 653 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) 654 return None; 655 656 // Where the dupq is a lane 0 replicate of a vector insert 657 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero()) 658 return None; 659 660 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); 661 if (!VecIns || 662 VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert) 663 return None; 664 665 // Where the vector insert is a fixed constant vector insert into undef at 666 // index zero 667 if (!isa<UndefValue>(VecIns->getArgOperand(0))) 668 return None; 669 670 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero()) 671 return None; 672 673 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1)); 674 if (!ConstVec) 675 return None; 676 677 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType()); 678 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType()); 679 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) 680 return None; 681 682 unsigned NumElts = VecTy->getNumElements(); 683 unsigned PredicateBits = 0; 684 685 // Expand intrinsic operands to a 16-bit byte level predicate 686 for (unsigned I = 0; I < NumElts; ++I) { 687 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I)); 688 if (!Arg) 689 return None; 690 if (!Arg->isZero()) 691 PredicateBits |= 1 << (I * (16 / NumElts)); 692 } 693 694 // If all bits are zero bail early with an empty predicate 695 if (PredicateBits == 0) { 696 auto *PFalse = Constant::getNullValue(II.getType()); 697 PFalse->takeName(&II); 698 return IC.replaceInstUsesWith(II, PFalse); 699 } 700 701 // Calculate largest predicate type used (where byte predicate is largest) 702 unsigned Mask = 8; 703 for (unsigned I = 0; I < 16; ++I) 704 if ((PredicateBits & (1 << I)) != 0) 705 Mask |= (I % 8); 706 707 unsigned PredSize = Mask & -Mask; 708 auto *PredType = ScalableVectorType::get( 709 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8)); 710 711 // Ensure all relevant bits are set 712 for (unsigned I = 0; I < 16; I += PredSize) 713 if ((PredicateBits & (1 << I)) == 0) 714 return None; 715 716 auto *PTruePat = 717 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 718 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 719 {PredType}, {PTruePat}); 720 auto *ConvertToSVBool = Builder.CreateIntrinsic( 721 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); 722 auto *ConvertFromSVBool = 723 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, 724 {II.getType()}, {ConvertToSVBool}); 725 726 ConvertFromSVBool->takeName(&II); 727 return IC.replaceInstUsesWith(II, ConvertFromSVBool); 728 } 729 730 static Optional<Instruction *> instCombineSVELast(InstCombiner &IC, 731 IntrinsicInst &II) { 732 IRBuilder<> Builder(II.getContext()); 733 Builder.SetInsertPoint(&II); 734 Value *Pg = II.getArgOperand(0); 735 Value *Vec = II.getArgOperand(1); 736 auto IntrinsicID = II.getIntrinsicID(); 737 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; 738 739 // lastX(splat(X)) --> X 740 if (auto *SplatVal = getSplatValue(Vec)) 741 return IC.replaceInstUsesWith(II, SplatVal); 742 743 // If x and/or y is a splat value then: 744 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) 745 Value *LHS, *RHS; 746 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) { 747 if (isSplatValue(LHS) || isSplatValue(RHS)) { 748 auto *OldBinOp = cast<BinaryOperator>(Vec); 749 auto OpC = OldBinOp->getOpcode(); 750 auto *NewLHS = 751 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS}); 752 auto *NewRHS = 753 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS}); 754 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( 755 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II); 756 return IC.replaceInstUsesWith(II, NewBinOp); 757 } 758 } 759 760 auto *C = dyn_cast<Constant>(Pg); 761 if (IsAfter && C && C->isNullValue()) { 762 // The intrinsic is extracting lane 0 so use an extract instead. 763 auto *IdxTy = Type::getInt64Ty(II.getContext()); 764 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0)); 765 Extract->insertBefore(&II); 766 Extract->takeName(&II); 767 return IC.replaceInstUsesWith(II, Extract); 768 } 769 770 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg); 771 if (!IntrPG) 772 return None; 773 774 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 775 return None; 776 777 const auto PTruePattern = 778 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue(); 779 780 // Can the intrinsic's predicate be converted to a known constant index? 781 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern); 782 if (!MinNumElts) 783 return None; 784 785 unsigned Idx = MinNumElts - 1; 786 // Increment the index if extracting the element after the last active 787 // predicate element. 788 if (IsAfter) 789 ++Idx; 790 791 // Ignore extracts whose index is larger than the known minimum vector 792 // length. NOTE: This is an artificial constraint where we prefer to 793 // maintain what the user asked for until an alternative is proven faster. 794 auto *PgVTy = cast<ScalableVectorType>(Pg->getType()); 795 if (Idx >= PgVTy->getMinNumElements()) 796 return None; 797 798 // The intrinsic is extracting a fixed lane so use an extract instead. 799 auto *IdxTy = Type::getInt64Ty(II.getContext()); 800 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx)); 801 Extract->insertBefore(&II); 802 Extract->takeName(&II); 803 return IC.replaceInstUsesWith(II, Extract); 804 } 805 806 static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC, 807 IntrinsicInst &II) { 808 LLVMContext &Ctx = II.getContext(); 809 IRBuilder<> Builder(Ctx); 810 Builder.SetInsertPoint(&II); 811 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr 812 // can work with RDFFR_PP for ptest elimination. 813 auto *AllPat = 814 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 815 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 816 {II.getType()}, {AllPat}); 817 auto *RDFFR = 818 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); 819 RDFFR->takeName(&II); 820 return IC.replaceInstUsesWith(II, RDFFR); 821 } 822 823 static Optional<Instruction *> 824 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { 825 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); 826 827 if (Pattern == AArch64SVEPredPattern::all) { 828 LLVMContext &Ctx = II.getContext(); 829 IRBuilder<> Builder(Ctx); 830 Builder.SetInsertPoint(&II); 831 832 Constant *StepVal = ConstantInt::get(II.getType(), NumElts); 833 auto *VScale = Builder.CreateVScale(StepVal); 834 VScale->takeName(&II); 835 return IC.replaceInstUsesWith(II, VScale); 836 } 837 838 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); 839 840 return MinNumElts && NumElts >= MinNumElts 841 ? Optional<Instruction *>(IC.replaceInstUsesWith( 842 II, ConstantInt::get(II.getType(), MinNumElts))) 843 : None; 844 } 845 846 static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, 847 IntrinsicInst &II) { 848 IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 849 IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 850 851 if (Op1 && Op2 && 852 Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 853 Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 854 Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) { 855 856 IRBuilder<> Builder(II.getContext()); 857 Builder.SetInsertPoint(&II); 858 859 Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)}; 860 Type *Tys[] = {Op1->getArgOperand(0)->getType()}; 861 862 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); 863 864 PTest->takeName(&II); 865 return IC.replaceInstUsesWith(II, PTest); 866 } 867 868 return None; 869 } 870 871 static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC, 872 IntrinsicInst &II) { 873 // fold (fadd p a (fmul p b c)) -> (fma p a b c) 874 Value *P = II.getOperand(0); 875 Value *A = II.getOperand(1); 876 auto FMul = II.getOperand(2); 877 Value *B, *C; 878 if (!match(FMul, m_Intrinsic<Intrinsic::aarch64_sve_fmul>( 879 m_Specific(P), m_Value(B), m_Value(C)))) 880 return None; 881 882 if (!FMul->hasOneUse()) 883 return None; 884 885 llvm::FastMathFlags FAddFlags = II.getFastMathFlags(); 886 // Stop the combine when the flags on the inputs differ in case dropping flags 887 // would lead to us missing out on more beneficial optimizations. 888 if (FAddFlags != cast<CallInst>(FMul)->getFastMathFlags()) 889 return None; 890 if (!FAddFlags.allowContract()) 891 return None; 892 893 IRBuilder<> Builder(II.getContext()); 894 Builder.SetInsertPoint(&II); 895 auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla, 896 {II.getType()}, {P, A, B, C}, &II); 897 FMLA->setFastMathFlags(FAddFlags); 898 return IC.replaceInstUsesWith(II, FMLA); 899 } 900 901 static bool isAllActivePredicate(Value *Pred) { 902 // Look through convert.from.svbool(convert.to.svbool(...) chain. 903 Value *UncastedPred; 904 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( 905 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( 906 m_Value(UncastedPred))))) 907 // If the predicate has the same or less lanes than the uncasted 908 // predicate then we know the casting has no effect. 909 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <= 910 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements()) 911 Pred = UncastedPred; 912 913 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 914 m_ConstantInt<AArch64SVEPredPattern::all>())); 915 } 916 917 static Optional<Instruction *> 918 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 919 IRBuilder<> Builder(II.getContext()); 920 Builder.SetInsertPoint(&II); 921 922 Value *Pred = II.getOperand(0); 923 Value *PtrOp = II.getOperand(1); 924 Type *VecTy = II.getType(); 925 Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo()); 926 927 if (isAllActivePredicate(Pred)) { 928 LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr); 929 Load->copyMetadata(II); 930 return IC.replaceInstUsesWith(II, Load); 931 } 932 933 CallInst *MaskedLoad = 934 Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL), 935 Pred, ConstantAggregateZero::get(VecTy)); 936 MaskedLoad->copyMetadata(II); 937 return IC.replaceInstUsesWith(II, MaskedLoad); 938 } 939 940 static Optional<Instruction *> 941 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 942 IRBuilder<> Builder(II.getContext()); 943 Builder.SetInsertPoint(&II); 944 945 Value *VecOp = II.getOperand(0); 946 Value *Pred = II.getOperand(1); 947 Value *PtrOp = II.getOperand(2); 948 Value *VecPtr = 949 Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo()); 950 951 if (isAllActivePredicate(Pred)) { 952 StoreInst *Store = Builder.CreateStore(VecOp, VecPtr); 953 Store->copyMetadata(II); 954 return IC.eraseInstFromFunction(II); 955 } 956 957 CallInst *MaskedStore = Builder.CreateMaskedStore( 958 VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred); 959 MaskedStore->copyMetadata(II); 960 return IC.eraseInstFromFunction(II); 961 } 962 963 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { 964 switch (Intrinsic) { 965 case Intrinsic::aarch64_sve_fmul: 966 return Instruction::BinaryOps::FMul; 967 case Intrinsic::aarch64_sve_fadd: 968 return Instruction::BinaryOps::FAdd; 969 case Intrinsic::aarch64_sve_fsub: 970 return Instruction::BinaryOps::FSub; 971 default: 972 return Instruction::BinaryOpsEnd; 973 } 974 } 975 976 static Optional<Instruction *> instCombineSVEVectorBinOp(InstCombiner &IC, 977 IntrinsicInst &II) { 978 auto *OpPredicate = II.getOperand(0); 979 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID()); 980 if (BinOpCode == Instruction::BinaryOpsEnd || 981 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 982 m_ConstantInt<AArch64SVEPredPattern::all>()))) 983 return None; 984 IRBuilder<> Builder(II.getContext()); 985 Builder.SetInsertPoint(&II); 986 Builder.setFastMathFlags(II.getFastMathFlags()); 987 auto BinOp = 988 Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2)); 989 return IC.replaceInstUsesWith(II, BinOp); 990 } 991 992 static Optional<Instruction *> instCombineSVEVectorFAdd(InstCombiner &IC, 993 IntrinsicInst &II) { 994 if (auto FMLA = instCombineSVEVectorFMLA(IC, II)) 995 return FMLA; 996 return instCombineSVEVectorBinOp(IC, II); 997 } 998 999 static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC, 1000 IntrinsicInst &II) { 1001 auto *OpPredicate = II.getOperand(0); 1002 auto *OpMultiplicand = II.getOperand(1); 1003 auto *OpMultiplier = II.getOperand(2); 1004 1005 IRBuilder<> Builder(II.getContext()); 1006 Builder.SetInsertPoint(&II); 1007 1008 // Return true if a given instruction is a unit splat value, false otherwise. 1009 auto IsUnitSplat = [](auto *I) { 1010 auto *SplatValue = getSplatValue(I); 1011 if (!SplatValue) 1012 return false; 1013 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 1014 }; 1015 1016 // Return true if a given instruction is an aarch64_sve_dup intrinsic call 1017 // with a unit splat value, false otherwise. 1018 auto IsUnitDup = [](auto *I) { 1019 auto *IntrI = dyn_cast<IntrinsicInst>(I); 1020 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) 1021 return false; 1022 1023 auto *SplatValue = IntrI->getOperand(2); 1024 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 1025 }; 1026 1027 if (IsUnitSplat(OpMultiplier)) { 1028 // [f]mul pg %n, (dupx 1) => %n 1029 OpMultiplicand->takeName(&II); 1030 return IC.replaceInstUsesWith(II, OpMultiplicand); 1031 } else if (IsUnitDup(OpMultiplier)) { 1032 // [f]mul pg %n, (dup pg 1) => %n 1033 auto *DupInst = cast<IntrinsicInst>(OpMultiplier); 1034 auto *DupPg = DupInst->getOperand(1); 1035 // TODO: this is naive. The optimization is still valid if DupPg 1036 // 'encompasses' OpPredicate, not only if they're the same predicate. 1037 if (OpPredicate == DupPg) { 1038 OpMultiplicand->takeName(&II); 1039 return IC.replaceInstUsesWith(II, OpMultiplicand); 1040 } 1041 } 1042 1043 return instCombineSVEVectorBinOp(IC, II); 1044 } 1045 1046 static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, 1047 IntrinsicInst &II) { 1048 IRBuilder<> Builder(II.getContext()); 1049 Builder.SetInsertPoint(&II); 1050 Value *UnpackArg = II.getArgOperand(0); 1051 auto *RetTy = cast<ScalableVectorType>(II.getType()); 1052 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || 1053 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; 1054 1055 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) 1056 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) 1057 if (auto *ScalarArg = getSplatValue(UnpackArg)) { 1058 ScalarArg = 1059 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned); 1060 Value *NewVal = 1061 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg); 1062 NewVal->takeName(&II); 1063 return IC.replaceInstUsesWith(II, NewVal); 1064 } 1065 1066 return None; 1067 } 1068 static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC, 1069 IntrinsicInst &II) { 1070 auto *OpVal = II.getOperand(0); 1071 auto *OpIndices = II.getOperand(1); 1072 VectorType *VTy = cast<VectorType>(II.getType()); 1073 1074 // Check whether OpIndices is a constant splat value < minimal element count 1075 // of result. 1076 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices)); 1077 if (!SplatValue || 1078 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue())) 1079 return None; 1080 1081 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to 1082 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. 1083 IRBuilder<> Builder(II.getContext()); 1084 Builder.SetInsertPoint(&II); 1085 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue); 1086 auto *VectorSplat = 1087 Builder.CreateVectorSplat(VTy->getElementCount(), Extract); 1088 1089 VectorSplat->takeName(&II); 1090 return IC.replaceInstUsesWith(II, VectorSplat); 1091 } 1092 1093 static Optional<Instruction *> instCombineSVETupleGet(InstCombiner &IC, 1094 IntrinsicInst &II) { 1095 // Try to remove sequences of tuple get/set. 1096 Value *SetTuple, *SetIndex, *SetValue; 1097 auto *GetTuple = II.getArgOperand(0); 1098 auto *GetIndex = II.getArgOperand(1); 1099 // Check that we have tuple_get(GetTuple, GetIndex) where GetTuple is a 1100 // call to tuple_set i.e. tuple_set(SetTuple, SetIndex, SetValue). 1101 // Make sure that the types of the current intrinsic and SetValue match 1102 // in order to safely remove the sequence. 1103 if (!match(GetTuple, 1104 m_Intrinsic<Intrinsic::aarch64_sve_tuple_set>( 1105 m_Value(SetTuple), m_Value(SetIndex), m_Value(SetValue))) || 1106 SetValue->getType() != II.getType()) 1107 return None; 1108 // Case where we get the same index right after setting it. 1109 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) --> SetValue 1110 if (GetIndex == SetIndex) 1111 return IC.replaceInstUsesWith(II, SetValue); 1112 // If we are getting a different index than what was set in the tuple_set 1113 // intrinsic. We can just set the input tuple to the one up in the chain. 1114 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) 1115 // --> tuple_get(SetTuple, GetIndex) 1116 return IC.replaceOperand(II, 0, SetTuple); 1117 } 1118 1119 static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC, 1120 IntrinsicInst &II) { 1121 // zip1(uzp1(A, B), uzp2(A, B)) --> A 1122 // zip2(uzp1(A, B), uzp2(A, B)) --> B 1123 Value *A, *B; 1124 if (match(II.getArgOperand(0), 1125 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) && 1126 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( 1127 m_Specific(A), m_Specific(B)))) 1128 return IC.replaceInstUsesWith( 1129 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); 1130 1131 return None; 1132 } 1133 1134 static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC, 1135 IntrinsicInst &II) { 1136 Value *Mask = II.getOperand(0); 1137 Value *BasePtr = II.getOperand(1); 1138 Value *Index = II.getOperand(2); 1139 Type *Ty = II.getType(); 1140 Value *PassThru = ConstantAggregateZero::get(Ty); 1141 1142 // Contiguous gather => masked load. 1143 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) 1144 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) 1145 Value *IndexBase; 1146 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1147 m_Value(IndexBase), m_SpecificInt(1)))) { 1148 IRBuilder<> Builder(II.getContext()); 1149 Builder.SetInsertPoint(&II); 1150 1151 Align Alignment = 1152 BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); 1153 1154 Type *VecPtrTy = PointerType::getUnqual(Ty); 1155 Value *Ptr = Builder.CreateGEP( 1156 cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase); 1157 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); 1158 CallInst *MaskedLoad = 1159 Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); 1160 MaskedLoad->takeName(&II); 1161 return IC.replaceInstUsesWith(II, MaskedLoad); 1162 } 1163 1164 return None; 1165 } 1166 1167 static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC, 1168 IntrinsicInst &II) { 1169 Value *Val = II.getOperand(0); 1170 Value *Mask = II.getOperand(1); 1171 Value *BasePtr = II.getOperand(2); 1172 Value *Index = II.getOperand(3); 1173 Type *Ty = Val->getType(); 1174 1175 // Contiguous scatter => masked store. 1176 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) 1177 // => (masked.store Value (gep BasePtr IndexBase) Align Mask) 1178 Value *IndexBase; 1179 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1180 m_Value(IndexBase), m_SpecificInt(1)))) { 1181 IRBuilder<> Builder(II.getContext()); 1182 Builder.SetInsertPoint(&II); 1183 1184 Align Alignment = 1185 BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); 1186 1187 Value *Ptr = Builder.CreateGEP( 1188 cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase); 1189 Type *VecPtrTy = PointerType::getUnqual(Ty); 1190 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); 1191 1192 (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); 1193 1194 return IC.eraseInstFromFunction(II); 1195 } 1196 1197 return None; 1198 } 1199 1200 static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, 1201 IntrinsicInst &II) { 1202 IRBuilder<> Builder(II.getContext()); 1203 Builder.SetInsertPoint(&II); 1204 Type *Int32Ty = Builder.getInt32Ty(); 1205 Value *Pred = II.getOperand(0); 1206 Value *Vec = II.getOperand(1); 1207 Value *DivVec = II.getOperand(2); 1208 1209 Value *SplatValue = getSplatValue(DivVec); 1210 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue); 1211 if (!SplatConstantInt) 1212 return None; 1213 APInt Divisor = SplatConstantInt->getValue(); 1214 1215 if (Divisor.isPowerOf2()) { 1216 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1217 auto ASRD = Builder.CreateIntrinsic( 1218 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1219 return IC.replaceInstUsesWith(II, ASRD); 1220 } 1221 if (Divisor.isNegatedPowerOf2()) { 1222 Divisor.negate(); 1223 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1224 auto ASRD = Builder.CreateIntrinsic( 1225 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1226 auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg, 1227 {ASRD->getType()}, {ASRD, Pred, ASRD}); 1228 return IC.replaceInstUsesWith(II, NEG); 1229 } 1230 1231 return None; 1232 } 1233 1234 static Optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, 1235 IntrinsicInst &II) { 1236 Value *A = II.getArgOperand(0); 1237 Value *B = II.getArgOperand(1); 1238 if (A == B) 1239 return IC.replaceInstUsesWith(II, A); 1240 1241 return None; 1242 } 1243 1244 static Optional<Instruction *> instCombineSVESrshl(InstCombiner &IC, 1245 IntrinsicInst &II) { 1246 IRBuilder<> Builder(&II); 1247 Value *Pred = II.getOperand(0); 1248 Value *Vec = II.getOperand(1); 1249 Value *Shift = II.getOperand(2); 1250 1251 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. 1252 Value *AbsPred, *MergedValue; 1253 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>( 1254 m_Value(MergedValue), m_Value(AbsPred), m_Value())) && 1255 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>( 1256 m_Value(MergedValue), m_Value(AbsPred), m_Value()))) 1257 1258 return None; 1259 1260 // Transform is valid if any of the following are true: 1261 // * The ABS merge value is an undef or non-negative 1262 // * The ABS predicate is all active 1263 // * The ABS predicate and the SRSHL predicates are the same 1264 if (!isa<UndefValue>(MergedValue) && 1265 !match(MergedValue, m_NonNegative()) && 1266 AbsPred != Pred && !isAllActivePredicate(AbsPred)) 1267 return None; 1268 1269 // Only valid when the shift amount is non-negative, otherwise the rounding 1270 // behaviour of SRSHL cannot be ignored. 1271 if (!match(Shift, m_NonNegative())) 1272 return None; 1273 1274 auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()}, 1275 {Pred, Vec, Shift}); 1276 1277 return IC.replaceInstUsesWith(II, LSL); 1278 } 1279 1280 Optional<Instruction *> 1281 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, 1282 IntrinsicInst &II) const { 1283 Intrinsic::ID IID = II.getIntrinsicID(); 1284 switch (IID) { 1285 default: 1286 break; 1287 case Intrinsic::aarch64_neon_fmaxnm: 1288 case Intrinsic::aarch64_neon_fminnm: 1289 return instCombineMaxMinNM(IC, II); 1290 case Intrinsic::aarch64_sve_convert_from_svbool: 1291 return instCombineConvertFromSVBool(IC, II); 1292 case Intrinsic::aarch64_sve_dup: 1293 return instCombineSVEDup(IC, II); 1294 case Intrinsic::aarch64_sve_dup_x: 1295 return instCombineSVEDupX(IC, II); 1296 case Intrinsic::aarch64_sve_cmpne: 1297 case Intrinsic::aarch64_sve_cmpne_wide: 1298 return instCombineSVECmpNE(IC, II); 1299 case Intrinsic::aarch64_sve_rdffr: 1300 return instCombineRDFFR(IC, II); 1301 case Intrinsic::aarch64_sve_lasta: 1302 case Intrinsic::aarch64_sve_lastb: 1303 return instCombineSVELast(IC, II); 1304 case Intrinsic::aarch64_sve_cntd: 1305 return instCombineSVECntElts(IC, II, 2); 1306 case Intrinsic::aarch64_sve_cntw: 1307 return instCombineSVECntElts(IC, II, 4); 1308 case Intrinsic::aarch64_sve_cnth: 1309 return instCombineSVECntElts(IC, II, 8); 1310 case Intrinsic::aarch64_sve_cntb: 1311 return instCombineSVECntElts(IC, II, 16); 1312 case Intrinsic::aarch64_sve_ptest_any: 1313 case Intrinsic::aarch64_sve_ptest_first: 1314 case Intrinsic::aarch64_sve_ptest_last: 1315 return instCombineSVEPTest(IC, II); 1316 case Intrinsic::aarch64_sve_mul: 1317 case Intrinsic::aarch64_sve_fmul: 1318 return instCombineSVEVectorMul(IC, II); 1319 case Intrinsic::aarch64_sve_fadd: 1320 return instCombineSVEVectorFAdd(IC, II); 1321 case Intrinsic::aarch64_sve_fsub: 1322 return instCombineSVEVectorBinOp(IC, II); 1323 case Intrinsic::aarch64_sve_tbl: 1324 return instCombineSVETBL(IC, II); 1325 case Intrinsic::aarch64_sve_uunpkhi: 1326 case Intrinsic::aarch64_sve_uunpklo: 1327 case Intrinsic::aarch64_sve_sunpkhi: 1328 case Intrinsic::aarch64_sve_sunpklo: 1329 return instCombineSVEUnpack(IC, II); 1330 case Intrinsic::aarch64_sve_tuple_get: 1331 return instCombineSVETupleGet(IC, II); 1332 case Intrinsic::aarch64_sve_zip1: 1333 case Intrinsic::aarch64_sve_zip2: 1334 return instCombineSVEZip(IC, II); 1335 case Intrinsic::aarch64_sve_ld1_gather_index: 1336 return instCombineLD1GatherIndex(IC, II); 1337 case Intrinsic::aarch64_sve_st1_scatter_index: 1338 return instCombineST1ScatterIndex(IC, II); 1339 case Intrinsic::aarch64_sve_ld1: 1340 return instCombineSVELD1(IC, II, DL); 1341 case Intrinsic::aarch64_sve_st1: 1342 return instCombineSVEST1(IC, II, DL); 1343 case Intrinsic::aarch64_sve_sdiv: 1344 return instCombineSVESDIV(IC, II); 1345 case Intrinsic::aarch64_sve_sel: 1346 return instCombineSVESel(IC, II); 1347 case Intrinsic::aarch64_sve_srshl: 1348 return instCombineSVESrshl(IC, II); 1349 } 1350 1351 return None; 1352 } 1353 1354 Optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( 1355 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, 1356 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, 1357 std::function<void(Instruction *, unsigned, APInt, APInt &)> 1358 SimplifyAndSetOp) const { 1359 switch (II.getIntrinsicID()) { 1360 default: 1361 break; 1362 case Intrinsic::aarch64_neon_fcvtxn: 1363 case Intrinsic::aarch64_neon_rshrn: 1364 case Intrinsic::aarch64_neon_sqrshrn: 1365 case Intrinsic::aarch64_neon_sqrshrun: 1366 case Intrinsic::aarch64_neon_sqshrn: 1367 case Intrinsic::aarch64_neon_sqshrun: 1368 case Intrinsic::aarch64_neon_sqxtn: 1369 case Intrinsic::aarch64_neon_sqxtun: 1370 case Intrinsic::aarch64_neon_uqrshrn: 1371 case Intrinsic::aarch64_neon_uqshrn: 1372 case Intrinsic::aarch64_neon_uqxtn: 1373 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); 1374 break; 1375 } 1376 1377 return None; 1378 } 1379 1380 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, 1381 ArrayRef<const Value *> Args) { 1382 1383 // A helper that returns a vector type from the given type. The number of 1384 // elements in type Ty determines the vector width. 1385 auto toVectorTy = [&](Type *ArgTy) { 1386 return VectorType::get(ArgTy->getScalarType(), 1387 cast<VectorType>(DstTy)->getElementCount()); 1388 }; 1389 1390 // Exit early if DstTy is not a vector type whose elements are at least 1391 // 16-bits wide. 1392 if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16) 1393 return false; 1394 1395 // Determine if the operation has a widening variant. We consider both the 1396 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the 1397 // instructions. 1398 // 1399 // TODO: Add additional widening operations (e.g., shl, etc.) once we 1400 // verify that their extending operands are eliminated during code 1401 // generation. 1402 switch (Opcode) { 1403 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). 1404 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). 1405 case Instruction::Mul: // SMULL(2), UMULL(2) 1406 break; 1407 default: 1408 return false; 1409 } 1410 1411 // To be a widening instruction (either the "wide" or "long" versions), the 1412 // second operand must be a sign- or zero extend. 1413 if (Args.size() != 2 || 1414 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1]))) 1415 return false; 1416 auto *Extend = cast<CastInst>(Args[1]); 1417 auto *Arg0 = dyn_cast<CastInst>(Args[0]); 1418 1419 // A mul only has a mull version (not like addw). Both operands need to be 1420 // extending and the same type. 1421 if (Opcode == Instruction::Mul && 1422 (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() || 1423 Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType())) 1424 return false; 1425 1426 // Legalize the destination type and ensure it can be used in a widening 1427 // operation. 1428 auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); 1429 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); 1430 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) 1431 return false; 1432 1433 // Legalize the source type and ensure it can be used in a widening 1434 // operation. 1435 auto *SrcTy = toVectorTy(Extend->getSrcTy()); 1436 auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); 1437 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); 1438 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) 1439 return false; 1440 1441 // Get the total number of vector elements in the legalized types. 1442 InstructionCost NumDstEls = 1443 DstTyL.first * DstTyL.second.getVectorMinNumElements(); 1444 InstructionCost NumSrcEls = 1445 SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); 1446 1447 // Return true if the legalized types have the same number of vector elements 1448 // and the destination element type size is twice that of the source type. 1449 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; 1450 } 1451 1452 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 1453 Type *Src, 1454 TTI::CastContextHint CCH, 1455 TTI::TargetCostKind CostKind, 1456 const Instruction *I) { 1457 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1458 assert(ISD && "Invalid opcode"); 1459 1460 // If the cast is observable, and it is used by a widening instruction (e.g., 1461 // uaddl, saddw, etc.), it may be free. 1462 if (I && I->hasOneUser()) { 1463 auto *SingleUser = cast<Instruction>(*I->user_begin()); 1464 SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); 1465 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { 1466 // If the cast is the second operand, it is free. We will generate either 1467 // a "wide" or "long" version of the widening instruction. 1468 if (I == SingleUser->getOperand(1)) 1469 return 0; 1470 // If the cast is not the second operand, it will be free if it looks the 1471 // same as the second operand. In this case, we will generate a "long" 1472 // version of the widening instruction. 1473 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) 1474 if (I->getOpcode() == unsigned(Cast->getOpcode()) && 1475 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) 1476 return 0; 1477 } 1478 } 1479 1480 // TODO: Allow non-throughput costs that aren't binary. 1481 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { 1482 if (CostKind != TTI::TCK_RecipThroughput) 1483 return Cost == 0 ? 0 : 1; 1484 return Cost; 1485 }; 1486 1487 EVT SrcTy = TLI->getValueType(DL, Src); 1488 EVT DstTy = TLI->getValueType(DL, Dst); 1489 1490 if (!SrcTy.isSimple() || !DstTy.isSimple()) 1491 return AdjustCost( 1492 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1493 1494 static const TypeConversionCostTblEntry 1495 ConversionTbl[] = { 1496 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, 1497 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, 1498 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, 1499 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, 1500 1501 // Truncations on nxvmiN 1502 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, 1503 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, 1504 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, 1505 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, 1506 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, 1507 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, 1508 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, 1509 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, 1510 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, 1511 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, 1512 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, 1513 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, 1514 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, 1515 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, 1516 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, 1517 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, 1518 1519 // The number of shll instructions for the extension. 1520 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1521 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1522 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1523 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1524 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1525 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1526 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1527 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1528 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1529 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1530 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1531 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1532 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1533 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1534 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1535 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1536 1537 // LowerVectorINT_TO_FP: 1538 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1539 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1540 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1541 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1542 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1543 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1544 1545 // Complex: to v2f32 1546 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1547 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1548 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1549 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1550 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1551 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1552 1553 // Complex: to v4f32 1554 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, 1555 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1556 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 1557 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1558 1559 // Complex: to v8f32 1560 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1561 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 1562 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1563 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 1564 1565 // Complex: to v16f32 1566 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1567 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1568 1569 // Complex: to v2f64 1570 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1571 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1572 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1573 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1574 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1575 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1576 1577 1578 // LowerVectorFP_TO_INT 1579 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, 1580 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 1581 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 1582 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, 1583 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 1584 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 1585 1586 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). 1587 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, 1588 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, 1589 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, 1590 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, 1591 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, 1592 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, 1593 1594 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 1595 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, 1596 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, 1597 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, 1598 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, 1599 1600 // Complex, from nxv2f32. 1601 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1602 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1603 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1604 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1605 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1606 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1607 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1608 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1609 1610 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. 1611 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, 1612 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, 1613 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, 1614 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, 1615 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, 1616 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, 1617 1618 // Complex, from nxv2f64. 1619 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1620 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1621 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1622 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1623 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1624 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1625 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1626 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1627 1628 // Complex, from nxv4f32. 1629 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1630 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1631 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1632 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1633 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1634 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1635 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1636 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1637 1638 // Complex, from nxv8f64. Illegal -> illegal conversions not required. 1639 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1640 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1641 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1642 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1643 1644 // Complex, from nxv4f64. Illegal -> illegal conversions not required. 1645 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1646 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1647 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1648 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1649 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1650 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1651 1652 // Complex, from nxv8f32. Illegal -> illegal conversions not required. 1653 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 1654 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 1655 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 1656 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 1657 1658 // Complex, from nxv8f16. 1659 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 1660 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 1661 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 1662 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 1663 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 1664 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 1665 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 1666 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 1667 1668 // Complex, from nxv4f16. 1669 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 1670 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 1671 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 1672 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 1673 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 1674 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 1675 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 1676 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 1677 1678 // Complex, from nxv2f16. 1679 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 1680 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 1681 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 1682 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 1683 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 1684 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 1685 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 1686 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 1687 1688 // Truncate from nxvmf32 to nxvmf16. 1689 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, 1690 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, 1691 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, 1692 1693 // Truncate from nxvmf64 to nxvmf16. 1694 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 }, 1695 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 }, 1696 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 }, 1697 1698 // Truncate from nxvmf64 to nxvmf32. 1699 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 }, 1700 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 }, 1701 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 }, 1702 1703 // Extend from nxvmf16 to nxvmf32. 1704 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, 1705 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, 1706 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, 1707 1708 // Extend from nxvmf16 to nxvmf64. 1709 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, 1710 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, 1711 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, 1712 1713 // Extend from nxvmf32 to nxvmf64. 1714 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, 1715 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, 1716 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, 1717 1718 // Bitcasts from float to integer 1719 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 }, 1720 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 }, 1721 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 }, 1722 1723 // Bitcasts from integer to float 1724 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 }, 1725 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 }, 1726 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 }, 1727 }; 1728 1729 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, 1730 DstTy.getSimpleVT(), 1731 SrcTy.getSimpleVT())) 1732 return AdjustCost(Entry->Cost); 1733 1734 static const TypeConversionCostTblEntry FP16Tbl[] = { 1735 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs 1736 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, 1737 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs 1738 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, 1739 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs 1740 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, 1741 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn 1742 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, 1743 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs 1744 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, 1745 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs 1746 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, 1747 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn 1748 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, 1749 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs 1750 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, 1751 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs 1752 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, 1753 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf 1754 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf 1755 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf 1756 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf 1757 }; 1758 1759 if (ST->hasFullFP16()) 1760 if (const auto *Entry = ConvertCostTableLookup( 1761 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 1762 return AdjustCost(Entry->Cost); 1763 1764 return AdjustCost( 1765 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1766 } 1767 1768 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, 1769 Type *Dst, 1770 VectorType *VecTy, 1771 unsigned Index) { 1772 1773 // Make sure we were given a valid extend opcode. 1774 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && 1775 "Invalid opcode"); 1776 1777 // We are extending an element we extract from a vector, so the source type 1778 // of the extend is the element type of the vector. 1779 auto *Src = VecTy->getElementType(); 1780 1781 // Sign- and zero-extends are for integer types only. 1782 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); 1783 1784 // Get the cost for the extract. We compute the cost (if any) for the extend 1785 // below. 1786 InstructionCost Cost = 1787 getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); 1788 1789 // Legalize the types. 1790 auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); 1791 auto DstVT = TLI->getValueType(DL, Dst); 1792 auto SrcVT = TLI->getValueType(DL, Src); 1793 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1794 1795 // If the resulting type is still a vector and the destination type is legal, 1796 // we may get the extension for free. If not, get the default cost for the 1797 // extend. 1798 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) 1799 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1800 CostKind); 1801 1802 // The destination type should be larger than the element type. If not, get 1803 // the default cost for the extend. 1804 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) 1805 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1806 CostKind); 1807 1808 switch (Opcode) { 1809 default: 1810 llvm_unreachable("Opcode should be either SExt or ZExt"); 1811 1812 // For sign-extends, we only need a smov, which performs the extension 1813 // automatically. 1814 case Instruction::SExt: 1815 return Cost; 1816 1817 // For zero-extends, the extend is performed automatically by a umov unless 1818 // the destination type is i64 and the element type is i8 or i16. 1819 case Instruction::ZExt: 1820 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) 1821 return Cost; 1822 } 1823 1824 // If we are unable to perform the extend for free, get the default cost. 1825 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1826 CostKind); 1827 } 1828 1829 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, 1830 TTI::TargetCostKind CostKind, 1831 const Instruction *I) { 1832 if (CostKind != TTI::TCK_RecipThroughput) 1833 return Opcode == Instruction::PHI ? 0 : 1; 1834 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); 1835 // Branches are assumed to be predicted. 1836 return 0; 1837 } 1838 1839 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 1840 unsigned Index) { 1841 assert(Val->isVectorTy() && "This must be a vector type"); 1842 1843 if (Index != -1U) { 1844 // Legalize the type. 1845 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); 1846 1847 // This type is legalized to a scalar type. 1848 if (!LT.second.isVector()) 1849 return 0; 1850 1851 // The type may be split. For fixed-width vectors we can normalize the 1852 // index to the new type. 1853 if (LT.second.isFixedLengthVector()) { 1854 unsigned Width = LT.second.getVectorNumElements(); 1855 Index = Index % Width; 1856 } 1857 1858 // The element at index zero is already inside the vector. 1859 if (Index == 0) 1860 return 0; 1861 } 1862 1863 // All other insert/extracts cost this much. 1864 return ST->getVectorInsertExtractBaseCost(); 1865 } 1866 1867 InstructionCost AArch64TTIImpl::getArithmeticInstrCost( 1868 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 1869 TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, 1870 TTI::OperandValueProperties Opd1PropInfo, 1871 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, 1872 const Instruction *CxtI) { 1873 // TODO: Handle more cost kinds. 1874 if (CostKind != TTI::TCK_RecipThroughput) 1875 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 1876 Opd2Info, Opd1PropInfo, 1877 Opd2PropInfo, Args, CxtI); 1878 1879 // Legalize the type. 1880 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 1881 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1882 1883 switch (ISD) { 1884 default: 1885 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 1886 Opd2Info, Opd1PropInfo, Opd2PropInfo); 1887 case ISD::SDIV: 1888 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue && 1889 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { 1890 // On AArch64, scalar signed division by constants power-of-two are 1891 // normally expanded to the sequence ADD + CMP + SELECT + SRA. 1892 // The OperandValue properties many not be same as that of previous 1893 // operation; conservatively assume OP_None. 1894 InstructionCost Cost = getArithmeticInstrCost( 1895 Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info, 1896 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 1897 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Opd1Info, 1898 Opd2Info, TargetTransformInfo::OP_None, 1899 TargetTransformInfo::OP_None); 1900 Cost += getArithmeticInstrCost( 1901 Instruction::Select, Ty, CostKind, Opd1Info, Opd2Info, 1902 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 1903 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Opd1Info, 1904 Opd2Info, TargetTransformInfo::OP_None, 1905 TargetTransformInfo::OP_None); 1906 return Cost; 1907 } 1908 LLVM_FALLTHROUGH; 1909 case ISD::UDIV: { 1910 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) { 1911 auto VT = TLI->getValueType(DL, Ty); 1912 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) { 1913 // Vector signed division by constant are expanded to the 1914 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division 1915 // to MULHS + SUB + SRL + ADD + SRL. 1916 InstructionCost MulCost = getArithmeticInstrCost( 1917 Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info, 1918 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 1919 InstructionCost AddCost = getArithmeticInstrCost( 1920 Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info, 1921 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 1922 InstructionCost ShrCost = getArithmeticInstrCost( 1923 Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info, 1924 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 1925 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1; 1926 } 1927 } 1928 1929 InstructionCost Cost = BaseT::getArithmeticInstrCost( 1930 Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); 1931 if (Ty->isVectorTy()) { 1932 // On AArch64, vector divisions are not supported natively and are 1933 // expanded into scalar divisions of each pair of elements. 1934 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind, 1935 Opd1Info, Opd2Info, Opd1PropInfo, 1936 Opd2PropInfo); 1937 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind, 1938 Opd1Info, Opd2Info, Opd1PropInfo, 1939 Opd2PropInfo); 1940 // TODO: if one of the arguments is scalar, then it's not necessary to 1941 // double the cost of handling the vector elements. 1942 Cost += Cost; 1943 } 1944 return Cost; 1945 } 1946 case ISD::MUL: 1947 // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive 1948 // as elements are extracted from the vectors and the muls scalarized. 1949 // As getScalarizationOverhead is a bit too pessimistic, we estimate the 1950 // cost for a i64 vector directly here, which is: 1951 // - four 2-cost i64 extracts, 1952 // - two 2-cost i64 inserts, and 1953 // - two 1-cost muls. 1954 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with 1955 // LT.first = 2 the cost is 28. If both operands are extensions it will not 1956 // need to scalarize so the cost can be cheaper (smull or umull). 1957 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) 1958 return LT.first; 1959 return LT.first * 14; 1960 case ISD::ADD: 1961 case ISD::XOR: 1962 case ISD::OR: 1963 case ISD::AND: 1964 case ISD::SRL: 1965 case ISD::SRA: 1966 case ISD::SHL: 1967 // These nodes are marked as 'custom' for combining purposes only. 1968 // We know that they are legal. See LowerAdd in ISelLowering. 1969 return LT.first; 1970 1971 case ISD::FADD: 1972 case ISD::FSUB: 1973 case ISD::FMUL: 1974 case ISD::FDIV: 1975 case ISD::FNEG: 1976 // These nodes are marked as 'custom' just to lower them to SVE. 1977 // We know said lowering will incur no additional cost. 1978 if (!Ty->getScalarType()->isFP128Ty()) 1979 return 2 * LT.first; 1980 1981 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 1982 Opd2Info, Opd1PropInfo, Opd2PropInfo); 1983 } 1984 } 1985 1986 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, 1987 ScalarEvolution *SE, 1988 const SCEV *Ptr) { 1989 // Address computations in vectorized code with non-consecutive addresses will 1990 // likely result in more instructions compared to scalar code where the 1991 // computation can more often be merged into the index mode. The resulting 1992 // extra micro-ops can significantly decrease throughput. 1993 unsigned NumVectorInstToHideOverhead = 10; 1994 int MaxMergeDistance = 64; 1995 1996 if (Ty->isVectorTy() && SE && 1997 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) 1998 return NumVectorInstToHideOverhead; 1999 2000 // In many cases the address computation is not merged into the instruction 2001 // addressing mode. 2002 return 1; 2003 } 2004 2005 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 2006 Type *CondTy, 2007 CmpInst::Predicate VecPred, 2008 TTI::TargetCostKind CostKind, 2009 const Instruction *I) { 2010 // TODO: Handle other cost kinds. 2011 if (CostKind != TTI::TCK_RecipThroughput) 2012 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 2013 I); 2014 2015 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2016 // We don't lower some vector selects well that are wider than the register 2017 // width. 2018 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { 2019 // We would need this many instructions to hide the scalarization happening. 2020 const int AmortizationCost = 20; 2021 2022 // If VecPred is not set, check if we can get a predicate from the context 2023 // instruction, if its type matches the requested ValTy. 2024 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { 2025 CmpInst::Predicate CurrentPred; 2026 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), 2027 m_Value()))) 2028 VecPred = CurrentPred; 2029 } 2030 // Check if we have a compare/select chain that can be lowered using 2031 // a (F)CMxx & BFI pair. 2032 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE || 2033 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT || 2034 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ || 2035 VecPred == CmpInst::FCMP_UNE) { 2036 static const auto ValidMinMaxTys = { 2037 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, 2038 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64}; 2039 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16}; 2040 2041 auto LT = TLI->getTypeLegalizationCost(DL, ValTy); 2042 if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; }) || 2043 (ST->hasFullFP16() && 2044 any_of(ValidFP16MinMaxTys, [<](MVT M) { return M == LT.second; }))) 2045 return LT.first; 2046 } 2047 2048 static const TypeConversionCostTblEntry 2049 VectorSelectTbl[] = { 2050 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, 2051 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, 2052 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, 2053 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, 2054 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, 2055 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } 2056 }; 2057 2058 EVT SelCondTy = TLI->getValueType(DL, CondTy); 2059 EVT SelValTy = TLI->getValueType(DL, ValTy); 2060 if (SelCondTy.isSimple() && SelValTy.isSimple()) { 2061 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, 2062 SelCondTy.getSimpleVT(), 2063 SelValTy.getSimpleVT())) 2064 return Entry->Cost; 2065 } 2066 } 2067 // The base case handles scalable vectors fine for now, since it treats the 2068 // cost as 1 * legalization cost. 2069 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); 2070 } 2071 2072 AArch64TTIImpl::TTI::MemCmpExpansionOptions 2073 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { 2074 TTI::MemCmpExpansionOptions Options; 2075 if (ST->requiresStrictAlign()) { 2076 // TODO: Add cost modeling for strict align. Misaligned loads expand to 2077 // a bunch of instructions when strict align is enabled. 2078 return Options; 2079 } 2080 Options.AllowOverlappingLoads = true; 2081 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); 2082 Options.NumLoadsPerBlock = Options.MaxNumLoads; 2083 // TODO: Though vector loads usually perform well on AArch64, in some targets 2084 // they may wake up the FP unit, which raises the power consumption. Perhaps 2085 // they could be used with no holds barred (-O3). 2086 Options.LoadSizes = {8, 4, 2, 1}; 2087 return Options; 2088 } 2089 2090 bool AArch64TTIImpl::prefersVectorizedAddressing() const { 2091 return ST->hasSVE(); 2092 } 2093 2094 InstructionCost 2095 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, 2096 Align Alignment, unsigned AddressSpace, 2097 TTI::TargetCostKind CostKind) { 2098 if (useNeonVector(Src)) 2099 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 2100 CostKind); 2101 auto LT = TLI->getTypeLegalizationCost(DL, Src); 2102 if (!LT.first.isValid()) 2103 return InstructionCost::getInvalid(); 2104 2105 // The code-generator is currently not able to handle scalable vectors 2106 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2107 // it. This change will be removed when code-generation for these types is 2108 // sufficiently reliable. 2109 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1)) 2110 return InstructionCost::getInvalid(); 2111 2112 return LT.first * 2; 2113 } 2114 2115 static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { 2116 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead; 2117 } 2118 2119 InstructionCost AArch64TTIImpl::getGatherScatterOpCost( 2120 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 2121 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 2122 if (useNeonVector(DataTy)) 2123 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 2124 Alignment, CostKind, I); 2125 auto *VT = cast<VectorType>(DataTy); 2126 auto LT = TLI->getTypeLegalizationCost(DL, DataTy); 2127 if (!LT.first.isValid()) 2128 return InstructionCost::getInvalid(); 2129 2130 // The code-generator is currently not able to handle scalable vectors 2131 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2132 // it. This change will be removed when code-generation for these types is 2133 // sufficiently reliable. 2134 if (cast<VectorType>(DataTy)->getElementCount() == 2135 ElementCount::getScalable(1)) 2136 return InstructionCost::getInvalid(); 2137 2138 ElementCount LegalVF = LT.second.getVectorElementCount(); 2139 InstructionCost MemOpCost = 2140 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); 2141 // Add on an overhead cost for using gathers/scatters. 2142 // TODO: At the moment this is applied unilaterally for all CPUs, but at some 2143 // point we may want a per-CPU overhead. 2144 MemOpCost *= getSVEGatherScatterOverhead(Opcode); 2145 return LT.first * MemOpCost * getMaxNumElements(LegalVF); 2146 } 2147 2148 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { 2149 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); 2150 } 2151 2152 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, 2153 MaybeAlign Alignment, 2154 unsigned AddressSpace, 2155 TTI::TargetCostKind CostKind, 2156 const Instruction *I) { 2157 EVT VT = TLI->getValueType(DL, Ty, true); 2158 // Type legalization can't handle structs 2159 if (VT == MVT::Other) 2160 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, 2161 CostKind); 2162 2163 auto LT = TLI->getTypeLegalizationCost(DL, Ty); 2164 if (!LT.first.isValid()) 2165 return InstructionCost::getInvalid(); 2166 2167 // The code-generator is currently not able to handle scalable vectors 2168 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2169 // it. This change will be removed when code-generation for these types is 2170 // sufficiently reliable. 2171 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) 2172 if (VTy->getElementCount() == ElementCount::getScalable(1)) 2173 return InstructionCost::getInvalid(); 2174 2175 // TODO: consider latency as well for TCK_SizeAndLatency. 2176 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) 2177 return LT.first; 2178 2179 if (CostKind != TTI::TCK_RecipThroughput) 2180 return 1; 2181 2182 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && 2183 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) { 2184 // Unaligned stores are extremely inefficient. We don't split all 2185 // unaligned 128-bit stores because the negative impact that has shown in 2186 // practice on inlined block copy code. 2187 // We make such stores expensive so that we will only vectorize if there 2188 // are 6 other instructions getting vectorized. 2189 const int AmortizationCost = 6; 2190 2191 return LT.first * 2 * AmortizationCost; 2192 } 2193 2194 // Check truncating stores and extending loads. 2195 if (useNeonVector(Ty) && 2196 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { 2197 // v4i8 types are lowered to scalar a load/store and sshll/xtn. 2198 if (VT == MVT::v4i8) 2199 return 2; 2200 // Otherwise we need to scalarize. 2201 return cast<FixedVectorType>(Ty)->getNumElements() * 2; 2202 } 2203 2204 return LT.first; 2205 } 2206 2207 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( 2208 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 2209 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 2210 bool UseMaskForCond, bool UseMaskForGaps) { 2211 assert(Factor >= 2 && "Invalid interleave factor"); 2212 auto *VecVTy = cast<FixedVectorType>(VecTy); 2213 2214 if (!UseMaskForCond && !UseMaskForGaps && 2215 Factor <= TLI->getMaxSupportedInterleaveFactor()) { 2216 unsigned NumElts = VecVTy->getNumElements(); 2217 auto *SubVecTy = 2218 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); 2219 2220 // ldN/stN only support legal vector types of size 64 or 128 in bits. 2221 // Accesses having vector types that are a multiple of 128 bits can be 2222 // matched to more than one ldN/stN instruction. 2223 bool UseScalable; 2224 if (NumElts % Factor == 0 && 2225 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) 2226 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); 2227 } 2228 2229 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 2230 Alignment, AddressSpace, CostKind, 2231 UseMaskForCond, UseMaskForGaps); 2232 } 2233 2234 InstructionCost 2235 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { 2236 InstructionCost Cost = 0; 2237 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2238 for (auto *I : Tys) { 2239 if (!I->isVectorTy()) 2240 continue; 2241 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == 2242 128) 2243 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + 2244 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); 2245 } 2246 return Cost; 2247 } 2248 2249 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { 2250 return ST->getMaxInterleaveFactor(); 2251 } 2252 2253 // For Falkor, we want to avoid having too many strided loads in a loop since 2254 // that can exhaust the HW prefetcher resources. We adjust the unroller 2255 // MaxCount preference below to attempt to ensure unrolling doesn't create too 2256 // many strided loads. 2257 static void 2258 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2259 TargetTransformInfo::UnrollingPreferences &UP) { 2260 enum { MaxStridedLoads = 7 }; 2261 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { 2262 int StridedLoads = 0; 2263 // FIXME? We could make this more precise by looking at the CFG and 2264 // e.g. not counting loads in each side of an if-then-else diamond. 2265 for (const auto BB : L->blocks()) { 2266 for (auto &I : *BB) { 2267 LoadInst *LMemI = dyn_cast<LoadInst>(&I); 2268 if (!LMemI) 2269 continue; 2270 2271 Value *PtrValue = LMemI->getPointerOperand(); 2272 if (L->isLoopInvariant(PtrValue)) 2273 continue; 2274 2275 const SCEV *LSCEV = SE.getSCEV(PtrValue); 2276 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); 2277 if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) 2278 continue; 2279 2280 // FIXME? We could take pairing of unrolled load copies into account 2281 // by looking at the AddRec, but we would probably have to limit this 2282 // to loops with no stores or other memory optimization barriers. 2283 ++StridedLoads; 2284 // We've seen enough strided loads that seeing more won't make a 2285 // difference. 2286 if (StridedLoads > MaxStridedLoads / 2) 2287 return StridedLoads; 2288 } 2289 } 2290 return StridedLoads; 2291 }; 2292 2293 int StridedLoads = countStridedLoads(L, SE); 2294 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads 2295 << " strided loads\n"); 2296 // Pick the largest power of 2 unroll count that won't result in too many 2297 // strided loads. 2298 if (StridedLoads) { 2299 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); 2300 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " 2301 << UP.MaxCount << '\n'); 2302 } 2303 } 2304 2305 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2306 TTI::UnrollingPreferences &UP, 2307 OptimizationRemarkEmitter *ORE) { 2308 // Enable partial unrolling and runtime unrolling. 2309 BaseT::getUnrollingPreferences(L, SE, UP, ORE); 2310 2311 UP.UpperBound = true; 2312 2313 // For inner loop, it is more likely to be a hot one, and the runtime check 2314 // can be promoted out from LICM pass, so the overhead is less, let's try 2315 // a larger threshold to unroll more loops. 2316 if (L->getLoopDepth() > 1) 2317 UP.PartialThreshold *= 2; 2318 2319 // Disable partial & runtime unrolling on -Os. 2320 UP.PartialOptSizeThreshold = 0; 2321 2322 if (ST->getProcFamily() == AArch64Subtarget::Falkor && 2323 EnableFalkorHWPFUnrollFix) 2324 getFalkorUnrollingPreferences(L, SE, UP); 2325 2326 // Scan the loop: don't unroll loops with calls as this could prevent 2327 // inlining. Don't unroll vector loops either, as they don't benefit much from 2328 // unrolling. 2329 for (auto *BB : L->getBlocks()) { 2330 for (auto &I : *BB) { 2331 // Don't unroll vectorised loop. 2332 if (I.getType()->isVectorTy()) 2333 return; 2334 2335 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 2336 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 2337 if (!isLoweredToCall(F)) 2338 continue; 2339 } 2340 return; 2341 } 2342 } 2343 } 2344 2345 // Enable runtime unrolling for in-order models 2346 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by 2347 // checking for that case, we can ensure that the default behaviour is 2348 // unchanged 2349 if (ST->getProcFamily() != AArch64Subtarget::Others && 2350 !ST->getSchedModel().isOutOfOrder()) { 2351 UP.Runtime = true; 2352 UP.Partial = true; 2353 UP.UnrollRemainder = true; 2354 UP.DefaultUnrollRuntimeCount = 4; 2355 2356 UP.UnrollAndJam = true; 2357 UP.UnrollAndJamInnerLoopThreshold = 60; 2358 } 2359 } 2360 2361 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 2362 TTI::PeelingPreferences &PP) { 2363 BaseT::getPeelingPreferences(L, SE, PP); 2364 } 2365 2366 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, 2367 Type *ExpectedType) { 2368 switch (Inst->getIntrinsicID()) { 2369 default: 2370 return nullptr; 2371 case Intrinsic::aarch64_neon_st2: 2372 case Intrinsic::aarch64_neon_st3: 2373 case Intrinsic::aarch64_neon_st4: { 2374 // Create a struct type 2375 StructType *ST = dyn_cast<StructType>(ExpectedType); 2376 if (!ST) 2377 return nullptr; 2378 unsigned NumElts = Inst->arg_size() - 1; 2379 if (ST->getNumElements() != NumElts) 2380 return nullptr; 2381 for (unsigned i = 0, e = NumElts; i != e; ++i) { 2382 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) 2383 return nullptr; 2384 } 2385 Value *Res = UndefValue::get(ExpectedType); 2386 IRBuilder<> Builder(Inst); 2387 for (unsigned i = 0, e = NumElts; i != e; ++i) { 2388 Value *L = Inst->getArgOperand(i); 2389 Res = Builder.CreateInsertValue(Res, L, i); 2390 } 2391 return Res; 2392 } 2393 case Intrinsic::aarch64_neon_ld2: 2394 case Intrinsic::aarch64_neon_ld3: 2395 case Intrinsic::aarch64_neon_ld4: 2396 if (Inst->getType() == ExpectedType) 2397 return Inst; 2398 return nullptr; 2399 } 2400 } 2401 2402 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, 2403 MemIntrinsicInfo &Info) { 2404 switch (Inst->getIntrinsicID()) { 2405 default: 2406 break; 2407 case Intrinsic::aarch64_neon_ld2: 2408 case Intrinsic::aarch64_neon_ld3: 2409 case Intrinsic::aarch64_neon_ld4: 2410 Info.ReadMem = true; 2411 Info.WriteMem = false; 2412 Info.PtrVal = Inst->getArgOperand(0); 2413 break; 2414 case Intrinsic::aarch64_neon_st2: 2415 case Intrinsic::aarch64_neon_st3: 2416 case Intrinsic::aarch64_neon_st4: 2417 Info.ReadMem = false; 2418 Info.WriteMem = true; 2419 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1); 2420 break; 2421 } 2422 2423 switch (Inst->getIntrinsicID()) { 2424 default: 2425 return false; 2426 case Intrinsic::aarch64_neon_ld2: 2427 case Intrinsic::aarch64_neon_st2: 2428 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; 2429 break; 2430 case Intrinsic::aarch64_neon_ld3: 2431 case Intrinsic::aarch64_neon_st3: 2432 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; 2433 break; 2434 case Intrinsic::aarch64_neon_ld4: 2435 case Intrinsic::aarch64_neon_st4: 2436 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; 2437 break; 2438 } 2439 return true; 2440 } 2441 2442 /// See if \p I should be considered for address type promotion. We check if \p 2443 /// I is a sext with right type and used in memory accesses. If it used in a 2444 /// "complex" getelementptr, we allow it to be promoted without finding other 2445 /// sext instructions that sign extended the same initial value. A getelementptr 2446 /// is considered as "complex" if it has more than 2 operands. 2447 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( 2448 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { 2449 bool Considerable = false; 2450 AllowPromotionWithoutCommonHeader = false; 2451 if (!isa<SExtInst>(&I)) 2452 return false; 2453 Type *ConsideredSExtType = 2454 Type::getInt64Ty(I.getParent()->getParent()->getContext()); 2455 if (I.getType() != ConsideredSExtType) 2456 return false; 2457 // See if the sext is the one with the right type and used in at least one 2458 // GetElementPtrInst. 2459 for (const User *U : I.users()) { 2460 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { 2461 Considerable = true; 2462 // A getelementptr is considered as "complex" if it has more than 2 2463 // operands. We will promote a SExt used in such complex GEP as we 2464 // expect some computation to be merged if they are done on 64 bits. 2465 if (GEPInst->getNumOperands() > 2) { 2466 AllowPromotionWithoutCommonHeader = true; 2467 break; 2468 } 2469 } 2470 } 2471 return Considerable; 2472 } 2473 2474 bool AArch64TTIImpl::isLegalToVectorizeReduction( 2475 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { 2476 if (!VF.isScalable()) 2477 return true; 2478 2479 Type *Ty = RdxDesc.getRecurrenceType(); 2480 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) 2481 return false; 2482 2483 switch (RdxDesc.getRecurrenceKind()) { 2484 case RecurKind::Add: 2485 case RecurKind::FAdd: 2486 case RecurKind::And: 2487 case RecurKind::Or: 2488 case RecurKind::Xor: 2489 case RecurKind::SMin: 2490 case RecurKind::SMax: 2491 case RecurKind::UMin: 2492 case RecurKind::UMax: 2493 case RecurKind::FMin: 2494 case RecurKind::FMax: 2495 case RecurKind::SelectICmp: 2496 case RecurKind::SelectFCmp: 2497 case RecurKind::FMulAdd: 2498 return true; 2499 default: 2500 return false; 2501 } 2502 } 2503 2504 InstructionCost 2505 AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, 2506 bool IsUnsigned, 2507 TTI::TargetCostKind CostKind) { 2508 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 2509 2510 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) 2511 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); 2512 2513 assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) && 2514 "Both vector needs to be equally scalable"); 2515 2516 InstructionCost LegalizationCost = 0; 2517 if (LT.first > 1) { 2518 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); 2519 unsigned MinMaxOpcode = 2520 Ty->isFPOrFPVectorTy() 2521 ? Intrinsic::maxnum 2522 : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin); 2523 IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy}); 2524 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1); 2525 } 2526 2527 return LegalizationCost + /*Cost of horizontal reduction*/ 2; 2528 } 2529 2530 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( 2531 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { 2532 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 2533 InstructionCost LegalizationCost = 0; 2534 if (LT.first > 1) { 2535 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); 2536 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); 2537 LegalizationCost *= LT.first - 1; 2538 } 2539 2540 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2541 assert(ISD && "Invalid opcode"); 2542 // Add the final reduction cost for the legal horizontal reduction 2543 switch (ISD) { 2544 case ISD::ADD: 2545 case ISD::AND: 2546 case ISD::OR: 2547 case ISD::XOR: 2548 case ISD::FADD: 2549 return LegalizationCost + 2; 2550 default: 2551 return InstructionCost::getInvalid(); 2552 } 2553 } 2554 2555 InstructionCost 2556 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, 2557 Optional<FastMathFlags> FMF, 2558 TTI::TargetCostKind CostKind) { 2559 if (TTI::requiresOrderedReduction(FMF)) { 2560 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) { 2561 InstructionCost BaseCost = 2562 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 2563 // Add on extra cost to reflect the extra overhead on some CPUs. We still 2564 // end up vectorizing for more computationally intensive loops. 2565 return BaseCost + FixedVTy->getNumElements(); 2566 } 2567 2568 if (Opcode != Instruction::FAdd) 2569 return InstructionCost::getInvalid(); 2570 2571 auto *VTy = cast<ScalableVectorType>(ValTy); 2572 InstructionCost Cost = 2573 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); 2574 Cost *= getMaxNumElements(VTy->getElementCount()); 2575 return Cost; 2576 } 2577 2578 if (isa<ScalableVectorType>(ValTy)) 2579 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); 2580 2581 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 2582 MVT MTy = LT.second; 2583 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2584 assert(ISD && "Invalid opcode"); 2585 2586 // Horizontal adds can use the 'addv' instruction. We model the cost of these 2587 // instructions as twice a normal vector add, plus 1 for each legalization 2588 // step (LT.first). This is the only arithmetic vector reduction operation for 2589 // which we have an instruction. 2590 // OR, XOR and AND costs should match the codegen from: 2591 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll 2592 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll 2593 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll 2594 static const CostTblEntry CostTblNoPairwise[]{ 2595 {ISD::ADD, MVT::v8i8, 2}, 2596 {ISD::ADD, MVT::v16i8, 2}, 2597 {ISD::ADD, MVT::v4i16, 2}, 2598 {ISD::ADD, MVT::v8i16, 2}, 2599 {ISD::ADD, MVT::v4i32, 2}, 2600 {ISD::OR, MVT::v8i8, 15}, 2601 {ISD::OR, MVT::v16i8, 17}, 2602 {ISD::OR, MVT::v4i16, 7}, 2603 {ISD::OR, MVT::v8i16, 9}, 2604 {ISD::OR, MVT::v2i32, 3}, 2605 {ISD::OR, MVT::v4i32, 5}, 2606 {ISD::OR, MVT::v2i64, 3}, 2607 {ISD::XOR, MVT::v8i8, 15}, 2608 {ISD::XOR, MVT::v16i8, 17}, 2609 {ISD::XOR, MVT::v4i16, 7}, 2610 {ISD::XOR, MVT::v8i16, 9}, 2611 {ISD::XOR, MVT::v2i32, 3}, 2612 {ISD::XOR, MVT::v4i32, 5}, 2613 {ISD::XOR, MVT::v2i64, 3}, 2614 {ISD::AND, MVT::v8i8, 15}, 2615 {ISD::AND, MVT::v16i8, 17}, 2616 {ISD::AND, MVT::v4i16, 7}, 2617 {ISD::AND, MVT::v8i16, 9}, 2618 {ISD::AND, MVT::v2i32, 3}, 2619 {ISD::AND, MVT::v4i32, 5}, 2620 {ISD::AND, MVT::v2i64, 3}, 2621 }; 2622 switch (ISD) { 2623 default: 2624 break; 2625 case ISD::ADD: 2626 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) 2627 return (LT.first - 1) + Entry->Cost; 2628 break; 2629 case ISD::XOR: 2630 case ISD::AND: 2631 case ISD::OR: 2632 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy); 2633 if (!Entry) 2634 break; 2635 auto *ValVTy = cast<FixedVectorType>(ValTy); 2636 if (!ValVTy->getElementType()->isIntegerTy(1) && 2637 MTy.getVectorNumElements() <= ValVTy->getNumElements() && 2638 isPowerOf2_32(ValVTy->getNumElements())) { 2639 InstructionCost ExtraCost = 0; 2640 if (LT.first != 1) { 2641 // Type needs to be split, so there is an extra cost of LT.first - 1 2642 // arithmetic ops. 2643 auto *Ty = FixedVectorType::get(ValTy->getElementType(), 2644 MTy.getVectorNumElements()); 2645 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); 2646 ExtraCost *= LT.first - 1; 2647 } 2648 return Entry->Cost + ExtraCost; 2649 } 2650 break; 2651 } 2652 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 2653 } 2654 2655 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { 2656 static const CostTblEntry ShuffleTbl[] = { 2657 { TTI::SK_Splice, MVT::nxv16i8, 1 }, 2658 { TTI::SK_Splice, MVT::nxv8i16, 1 }, 2659 { TTI::SK_Splice, MVT::nxv4i32, 1 }, 2660 { TTI::SK_Splice, MVT::nxv2i64, 1 }, 2661 { TTI::SK_Splice, MVT::nxv2f16, 1 }, 2662 { TTI::SK_Splice, MVT::nxv4f16, 1 }, 2663 { TTI::SK_Splice, MVT::nxv8f16, 1 }, 2664 { TTI::SK_Splice, MVT::nxv2bf16, 1 }, 2665 { TTI::SK_Splice, MVT::nxv4bf16, 1 }, 2666 { TTI::SK_Splice, MVT::nxv8bf16, 1 }, 2667 { TTI::SK_Splice, MVT::nxv2f32, 1 }, 2668 { TTI::SK_Splice, MVT::nxv4f32, 1 }, 2669 { TTI::SK_Splice, MVT::nxv2f64, 1 }, 2670 }; 2671 2672 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 2673 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext()); 2674 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2675 EVT PromotedVT = LT.second.getScalarType() == MVT::i1 2676 ? TLI->getPromotedVTForPredicate(EVT(LT.second)) 2677 : LT.second; 2678 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext()); 2679 InstructionCost LegalizationCost = 0; 2680 if (Index < 0) { 2681 LegalizationCost = 2682 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy, 2683 CmpInst::BAD_ICMP_PREDICATE, CostKind) + 2684 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy, 2685 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2686 } 2687 2688 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp 2689 // Cost performed on a promoted type. 2690 if (LT.second.getScalarType() == MVT::i1) { 2691 LegalizationCost += 2692 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy, 2693 TTI::CastContextHint::None, CostKind) + 2694 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy, 2695 TTI::CastContextHint::None, CostKind); 2696 } 2697 const auto *Entry = 2698 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT()); 2699 assert(Entry && "Illegal Type for Splice"); 2700 LegalizationCost += Entry->Cost; 2701 return LegalizationCost * LT.first; 2702 } 2703 2704 InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, 2705 VectorType *Tp, 2706 ArrayRef<int> Mask, int Index, 2707 VectorType *SubTp, 2708 ArrayRef<const Value *> Args) { 2709 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 2710 // If we have a Mask, and the LT is being legalized somehow, split the Mask 2711 // into smaller vectors and sum the cost of each shuffle. 2712 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() && 2713 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && 2714 cast<FixedVectorType>(Tp)->getNumElements() > 2715 LT.second.getVectorNumElements() && 2716 !Index && !SubTp) { 2717 unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements(); 2718 assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!"); 2719 unsigned LTNumElts = LT.second.getVectorNumElements(); 2720 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; 2721 VectorType *NTp = 2722 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount()); 2723 InstructionCost Cost; 2724 for (unsigned N = 0; N < NumVecs; N++) { 2725 SmallVector<int> NMask; 2726 // Split the existing mask into chunks of size LTNumElts. Track the source 2727 // sub-vectors to ensure the result has at most 2 inputs. 2728 unsigned Source1, Source2; 2729 unsigned NumSources = 0; 2730 for (unsigned E = 0; E < LTNumElts; E++) { 2731 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E] 2732 : UndefMaskElem; 2733 if (MaskElt < 0) { 2734 NMask.push_back(UndefMaskElem); 2735 continue; 2736 } 2737 2738 // Calculate which source from the input this comes from and whether it 2739 // is new to us. 2740 unsigned Source = MaskElt / LTNumElts; 2741 if (NumSources == 0) { 2742 Source1 = Source; 2743 NumSources = 1; 2744 } else if (NumSources == 1 && Source != Source1) { 2745 Source2 = Source; 2746 NumSources = 2; 2747 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) { 2748 NumSources++; 2749 } 2750 2751 // Add to the new mask. For the NumSources>2 case these are not correct, 2752 // but are only used for the modular lane number. 2753 if (Source == Source1) 2754 NMask.push_back(MaskElt % LTNumElts); 2755 else if (Source == Source2) 2756 NMask.push_back(MaskElt % LTNumElts + LTNumElts); 2757 else 2758 NMask.push_back(MaskElt % LTNumElts); 2759 } 2760 // If the sub-mask has at most 2 input sub-vectors then re-cost it using 2761 // getShuffleCost. If not then cost it using the worst case. 2762 if (NumSources <= 2) 2763 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc 2764 : TTI::SK_PermuteTwoSrc, 2765 NTp, NMask, 0, nullptr, Args); 2766 else if (any_of(enumerate(NMask), [&](const auto &ME) { 2767 return ME.value() % LTNumElts == ME.index(); 2768 })) 2769 Cost += LTNumElts - 1; 2770 else 2771 Cost += LTNumElts; 2772 } 2773 return Cost; 2774 } 2775 2776 Kind = improveShuffleKindFromMask(Kind, Mask); 2777 2778 // Check for broadcast loads. 2779 if (Kind == TTI::SK_Broadcast) { 2780 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]); 2781 if (IsLoad && LT.second.isVector() && 2782 isLegalBroadcastLoad(Tp->getElementType(), 2783 LT.second.getVectorElementCount())) 2784 return 0; // broadcast is handled by ld1r 2785 } 2786 2787 // If we have 4 elements for the shuffle and a Mask, get the cost straight 2788 // from the perfect shuffle tables. 2789 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) && 2790 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) && 2791 all_of(Mask, [](int E) { return E < 8; })) 2792 return getPerfectShuffleCost(Mask); 2793 2794 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || 2795 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || 2796 Kind == TTI::SK_Reverse) { 2797 2798 static const CostTblEntry ShuffleTbl[] = { 2799 // Broadcast shuffle kinds can be performed with 'dup'. 2800 { TTI::SK_Broadcast, MVT::v8i8, 1 }, 2801 { TTI::SK_Broadcast, MVT::v16i8, 1 }, 2802 { TTI::SK_Broadcast, MVT::v4i16, 1 }, 2803 { TTI::SK_Broadcast, MVT::v8i16, 1 }, 2804 { TTI::SK_Broadcast, MVT::v2i32, 1 }, 2805 { TTI::SK_Broadcast, MVT::v4i32, 1 }, 2806 { TTI::SK_Broadcast, MVT::v2i64, 1 }, 2807 { TTI::SK_Broadcast, MVT::v2f32, 1 }, 2808 { TTI::SK_Broadcast, MVT::v4f32, 1 }, 2809 { TTI::SK_Broadcast, MVT::v2f64, 1 }, 2810 // Transpose shuffle kinds can be performed with 'trn1/trn2' and 2811 // 'zip1/zip2' instructions. 2812 { TTI::SK_Transpose, MVT::v8i8, 1 }, 2813 { TTI::SK_Transpose, MVT::v16i8, 1 }, 2814 { TTI::SK_Transpose, MVT::v4i16, 1 }, 2815 { TTI::SK_Transpose, MVT::v8i16, 1 }, 2816 { TTI::SK_Transpose, MVT::v2i32, 1 }, 2817 { TTI::SK_Transpose, MVT::v4i32, 1 }, 2818 { TTI::SK_Transpose, MVT::v2i64, 1 }, 2819 { TTI::SK_Transpose, MVT::v2f32, 1 }, 2820 { TTI::SK_Transpose, MVT::v4f32, 1 }, 2821 { TTI::SK_Transpose, MVT::v2f64, 1 }, 2822 // Select shuffle kinds. 2823 // TODO: handle vXi8/vXi16. 2824 { TTI::SK_Select, MVT::v2i32, 1 }, // mov. 2825 { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar). 2826 { TTI::SK_Select, MVT::v2i64, 1 }, // mov. 2827 { TTI::SK_Select, MVT::v2f32, 1 }, // mov. 2828 { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar). 2829 { TTI::SK_Select, MVT::v2f64, 1 }, // mov. 2830 // PermuteSingleSrc shuffle kinds. 2831 { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov. 2832 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case. 2833 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov. 2834 { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov. 2835 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case. 2836 { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov. 2837 { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case. 2838 { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case. 2839 { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case. 2840 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl 2841 { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl 2842 { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl 2843 { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl 2844 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl 2845 // Reverse can be lowered with `rev`. 2846 { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov. 2847 { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT 2848 { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov. 2849 { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov. 2850 { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT 2851 { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov. 2852 { TTI::SK_Reverse, MVT::v8f16, 2 }, // REV64; EXT 2853 { TTI::SK_Reverse, MVT::v8i16, 2 }, // REV64; EXT 2854 { TTI::SK_Reverse, MVT::v16i8, 2 }, // REV64; EXT 2855 { TTI::SK_Reverse, MVT::v4f16, 1 }, // REV64 2856 { TTI::SK_Reverse, MVT::v4i16, 1 }, // REV64 2857 { TTI::SK_Reverse, MVT::v8i8, 1 }, // REV64 2858 // Broadcast shuffle kinds for scalable vectors 2859 { TTI::SK_Broadcast, MVT::nxv16i8, 1 }, 2860 { TTI::SK_Broadcast, MVT::nxv8i16, 1 }, 2861 { TTI::SK_Broadcast, MVT::nxv4i32, 1 }, 2862 { TTI::SK_Broadcast, MVT::nxv2i64, 1 }, 2863 { TTI::SK_Broadcast, MVT::nxv2f16, 1 }, 2864 { TTI::SK_Broadcast, MVT::nxv4f16, 1 }, 2865 { TTI::SK_Broadcast, MVT::nxv8f16, 1 }, 2866 { TTI::SK_Broadcast, MVT::nxv2bf16, 1 }, 2867 { TTI::SK_Broadcast, MVT::nxv4bf16, 1 }, 2868 { TTI::SK_Broadcast, MVT::nxv8bf16, 1 }, 2869 { TTI::SK_Broadcast, MVT::nxv2f32, 1 }, 2870 { TTI::SK_Broadcast, MVT::nxv4f32, 1 }, 2871 { TTI::SK_Broadcast, MVT::nxv2f64, 1 }, 2872 { TTI::SK_Broadcast, MVT::nxv16i1, 1 }, 2873 { TTI::SK_Broadcast, MVT::nxv8i1, 1 }, 2874 { TTI::SK_Broadcast, MVT::nxv4i1, 1 }, 2875 { TTI::SK_Broadcast, MVT::nxv2i1, 1 }, 2876 // Handle the cases for vector.reverse with scalable vectors 2877 { TTI::SK_Reverse, MVT::nxv16i8, 1 }, 2878 { TTI::SK_Reverse, MVT::nxv8i16, 1 }, 2879 { TTI::SK_Reverse, MVT::nxv4i32, 1 }, 2880 { TTI::SK_Reverse, MVT::nxv2i64, 1 }, 2881 { TTI::SK_Reverse, MVT::nxv2f16, 1 }, 2882 { TTI::SK_Reverse, MVT::nxv4f16, 1 }, 2883 { TTI::SK_Reverse, MVT::nxv8f16, 1 }, 2884 { TTI::SK_Reverse, MVT::nxv2bf16, 1 }, 2885 { TTI::SK_Reverse, MVT::nxv4bf16, 1 }, 2886 { TTI::SK_Reverse, MVT::nxv8bf16, 1 }, 2887 { TTI::SK_Reverse, MVT::nxv2f32, 1 }, 2888 { TTI::SK_Reverse, MVT::nxv4f32, 1 }, 2889 { TTI::SK_Reverse, MVT::nxv2f64, 1 }, 2890 { TTI::SK_Reverse, MVT::nxv16i1, 1 }, 2891 { TTI::SK_Reverse, MVT::nxv8i1, 1 }, 2892 { TTI::SK_Reverse, MVT::nxv4i1, 1 }, 2893 { TTI::SK_Reverse, MVT::nxv2i1, 1 }, 2894 }; 2895 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) 2896 return LT.first * Entry->Cost; 2897 } 2898 2899 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp)) 2900 return getSpliceCost(Tp, Index); 2901 2902 // Inserting a subvector can often be done with either a D, S or H register 2903 // move, so long as the inserted vector is "aligned". 2904 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() && 2905 LT.second.getSizeInBits() <= 128 && SubTp) { 2906 std::pair<InstructionCost, MVT> SubLT = 2907 TLI->getTypeLegalizationCost(DL, SubTp); 2908 if (SubLT.second.isVector()) { 2909 int NumElts = LT.second.getVectorNumElements(); 2910 int NumSubElts = SubLT.second.getVectorNumElements(); 2911 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) 2912 return SubLT.first; 2913 } 2914 } 2915 2916 return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); 2917 } 2918