1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "AArch64TargetTransformInfo.h" 10 #include "AArch64ExpandImm.h" 11 #include "AArch64PerfectShuffle.h" 12 #include "MCTargetDesc/AArch64AddressingModes.h" 13 #include "llvm/Analysis/IVDescriptors.h" 14 #include "llvm/Analysis/LoopInfo.h" 15 #include "llvm/Analysis/TargetTransformInfo.h" 16 #include "llvm/CodeGen/BasicTTIImpl.h" 17 #include "llvm/CodeGen/CostTable.h" 18 #include "llvm/CodeGen/TargetLowering.h" 19 #include "llvm/IR/IntrinsicInst.h" 20 #include "llvm/IR/Intrinsics.h" 21 #include "llvm/IR/IntrinsicsAArch64.h" 22 #include "llvm/IR/PatternMatch.h" 23 #include "llvm/Support/Debug.h" 24 #include "llvm/Transforms/InstCombine/InstCombiner.h" 25 #include <algorithm> 26 using namespace llvm; 27 using namespace llvm::PatternMatch; 28 29 #define DEBUG_TYPE "aarch64tti" 30 31 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", 32 cl::init(true), cl::Hidden); 33 34 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10), 35 cl::Hidden); 36 37 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", 38 cl::init(10), cl::Hidden); 39 40 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, 41 const Function *Callee) const { 42 const TargetMachine &TM = getTLI()->getTargetMachine(); 43 44 const FeatureBitset &CallerBits = 45 TM.getSubtargetImpl(*Caller)->getFeatureBits(); 46 const FeatureBitset &CalleeBits = 47 TM.getSubtargetImpl(*Callee)->getFeatureBits(); 48 49 // Inline a callee if its target-features are a subset of the callers 50 // target-features. 
51 return (CallerBits & CalleeBits) == CalleeBits; 52 } 53 54 bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( 55 TargetTransformInfo::RegisterKind K) const { 56 assert(K != TargetTransformInfo::RGK_Scalar); 57 return K == TargetTransformInfo::RGK_FixedWidthVector; 58 } 59 60 /// Calculate the cost of materializing a 64-bit value. This helper 61 /// method might only calculate a fraction of a larger immediate. Therefore it 62 /// is valid to return a cost of ZERO. 63 InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) { 64 // Check if the immediate can be encoded within an instruction. 65 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) 66 return 0; 67 68 if (Val < 0) 69 Val = ~Val; 70 71 // Calculate how many moves we will need to materialize this constant. 72 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 73 AArch64_IMM::expandMOVImm(Val, 64, Insn); 74 return Insn.size(); 75 } 76 77 /// Calculate the cost of materializing the given constant. 78 InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 79 TTI::TargetCostKind CostKind) { 80 assert(Ty->isIntegerTy()); 81 82 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 83 if (BitSize == 0) 84 return ~0U; 85 86 // Sign-extend all constants to a multiple of 64-bit. 87 APInt ImmVal = Imm; 88 if (BitSize & 0x3f) 89 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); 90 91 // Split the constant into 64-bit chunks and calculate the cost for each 92 // chunk. 93 InstructionCost Cost = 0; 94 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { 95 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); 96 int64_t Val = Tmp.getSExtValue(); 97 Cost += getIntImmCost(Val); 98 } 99 // We need at least one instruction to materialze the constant. 
100 return std::max<InstructionCost>(1, Cost); 101 } 102 103 InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 104 const APInt &Imm, Type *Ty, 105 TTI::TargetCostKind CostKind, 106 Instruction *Inst) { 107 assert(Ty->isIntegerTy()); 108 109 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 110 // There is no cost model for constants with a bit size of 0. Return TCC_Free 111 // here, so that constant hoisting will ignore this constant. 112 if (BitSize == 0) 113 return TTI::TCC_Free; 114 115 unsigned ImmIdx = ~0U; 116 switch (Opcode) { 117 default: 118 return TTI::TCC_Free; 119 case Instruction::GetElementPtr: 120 // Always hoist the base address of a GetElementPtr. 121 if (Idx == 0) 122 return 2 * TTI::TCC_Basic; 123 return TTI::TCC_Free; 124 case Instruction::Store: 125 ImmIdx = 0; 126 break; 127 case Instruction::Add: 128 case Instruction::Sub: 129 case Instruction::Mul: 130 case Instruction::UDiv: 131 case Instruction::SDiv: 132 case Instruction::URem: 133 case Instruction::SRem: 134 case Instruction::And: 135 case Instruction::Or: 136 case Instruction::Xor: 137 case Instruction::ICmp: 138 ImmIdx = 1; 139 break; 140 // Always return TCC_Free for the shift value of a shift instruction. 141 case Instruction::Shl: 142 case Instruction::LShr: 143 case Instruction::AShr: 144 if (Idx == 1) 145 return TTI::TCC_Free; 146 break; 147 case Instruction::Trunc: 148 case Instruction::ZExt: 149 case Instruction::SExt: 150 case Instruction::IntToPtr: 151 case Instruction::PtrToInt: 152 case Instruction::BitCast: 153 case Instruction::PHI: 154 case Instruction::Call: 155 case Instruction::Select: 156 case Instruction::Ret: 157 case Instruction::Load: 158 break; 159 } 160 161 if (Idx == ImmIdx) { 162 int NumConstants = (BitSize + 63) / 64; 163 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 164 return (Cost <= NumConstants * TTI::TCC_Basic) 165 ? 
static_cast<int>(TTI::TCC_Free) 166 : Cost; 167 } 168 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 169 } 170 171 InstructionCost 172 AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, 173 const APInt &Imm, Type *Ty, 174 TTI::TargetCostKind CostKind) { 175 assert(Ty->isIntegerTy()); 176 177 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 178 // There is no cost model for constants with a bit size of 0. Return TCC_Free 179 // here, so that constant hoisting will ignore this constant. 180 if (BitSize == 0) 181 return TTI::TCC_Free; 182 183 // Most (all?) AArch64 intrinsics do not support folding immediates into the 184 // selected instruction, so we compute the materialization cost for the 185 // immediate directly. 186 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) 187 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 188 189 switch (IID) { 190 default: 191 return TTI::TCC_Free; 192 case Intrinsic::sadd_with_overflow: 193 case Intrinsic::uadd_with_overflow: 194 case Intrinsic::ssub_with_overflow: 195 case Intrinsic::usub_with_overflow: 196 case Intrinsic::smul_with_overflow: 197 case Intrinsic::umul_with_overflow: 198 if (Idx == 1) { 199 int NumConstants = (BitSize + 63) / 64; 200 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 201 return (Cost <= NumConstants * TTI::TCC_Basic) 202 ? 
static_cast<int>(TTI::TCC_Free) 203 : Cost; 204 } 205 break; 206 case Intrinsic::experimental_stackmap: 207 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 208 return TTI::TCC_Free; 209 break; 210 case Intrinsic::experimental_patchpoint_void: 211 case Intrinsic::experimental_patchpoint_i64: 212 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 213 return TTI::TCC_Free; 214 break; 215 case Intrinsic::experimental_gc_statepoint: 216 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 217 return TTI::TCC_Free; 218 break; 219 } 220 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 221 } 222 223 TargetTransformInfo::PopcntSupportKind 224 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { 225 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 226 if (TyWidth == 32 || TyWidth == 64) 227 return TTI::PSK_FastHardware; 228 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. 229 return TTI::PSK_Software; 230 } 231 232 InstructionCost 233 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 234 TTI::TargetCostKind CostKind) { 235 auto *RetTy = ICA.getReturnType(); 236 switch (ICA.getID()) { 237 case Intrinsic::umin: 238 case Intrinsic::umax: 239 case Intrinsic::smin: 240 case Intrinsic::smax: { 241 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, 242 MVT::v8i16, MVT::v2i32, MVT::v4i32}; 243 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 244 // v2i64 types get converted to cmp+bif hence the cost of 2 245 if (LT.second == MVT::v2i64) 246 return LT.first * 2; 247 if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; })) 248 return LT.first; 249 break; 250 } 251 case Intrinsic::sadd_sat: 252 case Intrinsic::ssub_sat: 253 case Intrinsic::uadd_sat: 254 case Intrinsic::usub_sat: { 255 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, 256 MVT::v8i16, MVT::v2i32, MVT::v4i32, 257 MVT::v2i64}; 
258 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 259 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we 260 // need to extend the type, as it uses shr(qadd(shl, shl)). 261 unsigned Instrs = 262 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4; 263 if (any_of(ValidSatTys, [<](MVT M) { return M == LT.second; })) 264 return LT.first * Instrs; 265 break; 266 } 267 case Intrinsic::abs: { 268 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, 269 MVT::v8i16, MVT::v2i32, MVT::v4i32, 270 MVT::v2i64}; 271 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 272 if (any_of(ValidAbsTys, [<](MVT M) { return M == LT.second; })) 273 return LT.first; 274 break; 275 } 276 case Intrinsic::experimental_stepvector: { 277 InstructionCost Cost = 1; // Cost of the `index' instruction 278 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 279 // Legalisation of illegal vectors involves an `index' instruction plus 280 // (LT.first - 1) vector adds. 
281 if (LT.first > 1) { 282 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext()); 283 InstructionCost AddCost = 284 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind); 285 Cost += AddCost * (LT.first - 1); 286 } 287 return Cost; 288 } 289 case Intrinsic::bitreverse: { 290 static const CostTblEntry BitreverseTbl[] = { 291 {Intrinsic::bitreverse, MVT::i32, 1}, 292 {Intrinsic::bitreverse, MVT::i64, 1}, 293 {Intrinsic::bitreverse, MVT::v8i8, 1}, 294 {Intrinsic::bitreverse, MVT::v16i8, 1}, 295 {Intrinsic::bitreverse, MVT::v4i16, 2}, 296 {Intrinsic::bitreverse, MVT::v8i16, 2}, 297 {Intrinsic::bitreverse, MVT::v2i32, 2}, 298 {Intrinsic::bitreverse, MVT::v4i32, 2}, 299 {Intrinsic::bitreverse, MVT::v1i64, 2}, 300 {Intrinsic::bitreverse, MVT::v2i64, 2}, 301 }; 302 const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy); 303 const auto *Entry = 304 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second); 305 if (Entry) { 306 // Cost Model is using the legal type(i32) that i8 and i16 will be 307 // converted to +1 so that we match the actual lowering cost 308 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 || 309 TLI->getValueType(DL, RetTy, true) == MVT::i16) 310 return LegalisationCost.first * Entry->Cost + 1; 311 312 return LegalisationCost.first * Entry->Cost; 313 } 314 break; 315 } 316 case Intrinsic::ctpop: { 317 static const CostTblEntry CtpopCostTbl[] = { 318 {ISD::CTPOP, MVT::v2i64, 4}, 319 {ISD::CTPOP, MVT::v4i32, 3}, 320 {ISD::CTPOP, MVT::v8i16, 2}, 321 {ISD::CTPOP, MVT::v16i8, 1}, 322 {ISD::CTPOP, MVT::i64, 4}, 323 {ISD::CTPOP, MVT::v2i32, 3}, 324 {ISD::CTPOP, MVT::v4i16, 2}, 325 {ISD::CTPOP, MVT::v8i8, 1}, 326 {ISD::CTPOP, MVT::i32, 5}, 327 }; 328 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 329 MVT MTy = LT.second; 330 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) { 331 // Extra cost of +1 when illegal vector types are legalized by promoting 332 // the integer type. 
333 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() != 334 RetTy->getScalarSizeInBits() 335 ? 1 336 : 0; 337 return LT.first * Entry->Cost + ExtraCost; 338 } 339 break; 340 } 341 case Intrinsic::sadd_with_overflow: 342 case Intrinsic::uadd_with_overflow: 343 case Intrinsic::ssub_with_overflow: 344 case Intrinsic::usub_with_overflow: 345 case Intrinsic::smul_with_overflow: 346 case Intrinsic::umul_with_overflow: { 347 static const CostTblEntry WithOverflowCostTbl[] = { 348 {Intrinsic::sadd_with_overflow, MVT::i8, 3}, 349 {Intrinsic::uadd_with_overflow, MVT::i8, 3}, 350 {Intrinsic::sadd_with_overflow, MVT::i16, 3}, 351 {Intrinsic::uadd_with_overflow, MVT::i16, 3}, 352 {Intrinsic::sadd_with_overflow, MVT::i32, 1}, 353 {Intrinsic::uadd_with_overflow, MVT::i32, 1}, 354 {Intrinsic::sadd_with_overflow, MVT::i64, 1}, 355 {Intrinsic::uadd_with_overflow, MVT::i64, 1}, 356 {Intrinsic::ssub_with_overflow, MVT::i8, 3}, 357 {Intrinsic::usub_with_overflow, MVT::i8, 3}, 358 {Intrinsic::ssub_with_overflow, MVT::i16, 3}, 359 {Intrinsic::usub_with_overflow, MVT::i16, 3}, 360 {Intrinsic::ssub_with_overflow, MVT::i32, 1}, 361 {Intrinsic::usub_with_overflow, MVT::i32, 1}, 362 {Intrinsic::ssub_with_overflow, MVT::i64, 1}, 363 {Intrinsic::usub_with_overflow, MVT::i64, 1}, 364 {Intrinsic::smul_with_overflow, MVT::i8, 5}, 365 {Intrinsic::umul_with_overflow, MVT::i8, 4}, 366 {Intrinsic::smul_with_overflow, MVT::i16, 5}, 367 {Intrinsic::umul_with_overflow, MVT::i16, 4}, 368 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst 369 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw 370 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp 371 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr 372 }; 373 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true); 374 if (MTy.isSimple()) 375 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(), 376 MTy.getSimpleVT())) 377 return Entry->Cost; 378 break; 
379 } 380 case Intrinsic::fptosi_sat: 381 case Intrinsic::fptoui_sat: { 382 if (ICA.getArgTypes().empty()) 383 break; 384 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; 385 auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]); 386 EVT MTy = TLI->getValueType(DL, RetTy); 387 // Check for the legal types, which are where the size of the input and the 388 // output are the same, or we are using cvt f64->i32 or f32->i64. 389 if ((LT.second == MVT::f32 || LT.second == MVT::f64 || 390 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || 391 LT.second == MVT::v2f64) && 392 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || 393 (LT.second == MVT::f64 && MTy == MVT::i32) || 394 (LT.second == MVT::f32 && MTy == MVT::i64))) 395 return LT.first; 396 // Similarly for fp16 sizes 397 if (ST->hasFullFP16() && 398 ((LT.second == MVT::f16 && MTy == MVT::i32) || 399 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && 400 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))) 401 return LT.first; 402 403 // Otherwise we use a legal convert followed by a min+max 404 if ((LT.second.getScalarType() == MVT::f32 || 405 LT.second.getScalarType() == MVT::f64 || 406 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) && 407 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { 408 Type *LegalTy = 409 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits()); 410 if (LT.second.isVector()) 411 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount()); 412 InstructionCost Cost = 1; 413 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin, 414 LegalTy, {LegalTy, LegalTy}); 415 Cost += getIntrinsicInstrCost(Attrs1, CostKind); 416 IntrinsicCostAttributes Attrs2(IsSigned ? 
Intrinsic::smax : Intrinsic::umax, 417 LegalTy, {LegalTy, LegalTy}); 418 Cost += getIntrinsicInstrCost(Attrs2, CostKind); 419 return LT.first * Cost; 420 } 421 break; 422 } 423 default: 424 break; 425 } 426 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 427 } 428 429 /// The function will remove redundant reinterprets casting in the presence 430 /// of the control flow 431 static Optional<Instruction *> processPhiNode(InstCombiner &IC, 432 IntrinsicInst &II) { 433 SmallVector<Instruction *, 32> Worklist; 434 auto RequiredType = II.getType(); 435 436 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0)); 437 assert(PN && "Expected Phi Node!"); 438 439 // Don't create a new Phi unless we can remove the old one. 440 if (!PN->hasOneUse()) 441 return None; 442 443 for (Value *IncValPhi : PN->incoming_values()) { 444 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi); 445 if (!Reinterpret || 446 Reinterpret->getIntrinsicID() != 447 Intrinsic::aarch64_sve_convert_to_svbool || 448 RequiredType != Reinterpret->getArgOperand(0)->getType()) 449 return None; 450 } 451 452 // Create the new Phi 453 LLVMContext &Ctx = PN->getContext(); 454 IRBuilder<> Builder(Ctx); 455 Builder.SetInsertPoint(PN); 456 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); 457 Worklist.push_back(PN); 458 459 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { 460 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); 461 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); 462 Worklist.push_back(Reinterpret); 463 } 464 465 // Cleanup Phi Node and reinterprets 466 return IC.replaceInstUsesWith(II, NPN); 467 } 468 469 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))) 470 // => (binop (pred) (from_svbool _) (from_svbool _)) 471 // 472 // The above transformation eliminates a `to_svbool` in the predicate 473 // operand of bitwise operation `binop` by narrowing the vector width of 474 // the operation. 
For example, it would convert a `<vscale x 16 x i1> 475 // and` into a `<vscale x 4 x i1> and`. This is profitable because 476 // to_svbool must zero the new lanes during widening, whereas 477 // from_svbool is free. 478 static Optional<Instruction *> tryCombineFromSVBoolBinOp(InstCombiner &IC, 479 IntrinsicInst &II) { 480 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0)); 481 if (!BinOp) 482 return None; 483 484 auto IntrinsicID = BinOp->getIntrinsicID(); 485 switch (IntrinsicID) { 486 case Intrinsic::aarch64_sve_and_z: 487 case Intrinsic::aarch64_sve_bic_z: 488 case Intrinsic::aarch64_sve_eor_z: 489 case Intrinsic::aarch64_sve_nand_z: 490 case Intrinsic::aarch64_sve_nor_z: 491 case Intrinsic::aarch64_sve_orn_z: 492 case Intrinsic::aarch64_sve_orr_z: 493 break; 494 default: 495 return None; 496 } 497 498 auto BinOpPred = BinOp->getOperand(0); 499 auto BinOpOp1 = BinOp->getOperand(1); 500 auto BinOpOp2 = BinOp->getOperand(2); 501 502 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred); 503 if (!PredIntr || 504 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) 505 return None; 506 507 auto PredOp = PredIntr->getOperand(0); 508 auto PredOpTy = cast<VectorType>(PredOp->getType()); 509 if (PredOpTy != II.getType()) 510 return None; 511 512 IRBuilder<> Builder(II.getContext()); 513 Builder.SetInsertPoint(&II); 514 515 SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; 516 auto NarrowBinOpOp1 = Builder.CreateIntrinsic( 517 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1}); 518 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 519 if (BinOpOp1 == BinOpOp2) 520 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 521 else 522 NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic( 523 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2})); 524 525 auto NarrowedBinOp = 526 Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs); 527 return IC.replaceInstUsesWith(II, NarrowedBinOp); 528 } 529 530 static 
Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC, 531 IntrinsicInst &II) { 532 // If the reinterpret instruction operand is a PHI Node 533 if (isa<PHINode>(II.getArgOperand(0))) 534 return processPhiNode(IC, II); 535 536 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) 537 return BinOpCombine; 538 539 SmallVector<Instruction *, 32> CandidatesForRemoval; 540 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; 541 542 const auto *IVTy = cast<VectorType>(II.getType()); 543 544 // Walk the chain of conversions. 545 while (Cursor) { 546 // If the type of the cursor has fewer lanes than the final result, zeroing 547 // must take place, which breaks the equivalence chain. 548 const auto *CursorVTy = cast<VectorType>(Cursor->getType()); 549 if (CursorVTy->getElementCount().getKnownMinValue() < 550 IVTy->getElementCount().getKnownMinValue()) 551 break; 552 553 // If the cursor has the same type as I, it is a viable replacement. 554 if (Cursor->getType() == IVTy) 555 EarliestReplacement = Cursor; 556 557 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); 558 559 // If this is not an SVE conversion intrinsic, this is the end of the chain. 560 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == 561 Intrinsic::aarch64_sve_convert_to_svbool || 562 IntrinsicCursor->getIntrinsicID() == 563 Intrinsic::aarch64_sve_convert_from_svbool)) 564 break; 565 566 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); 567 Cursor = IntrinsicCursor->getOperand(0); 568 } 569 570 // If no viable replacement in the conversion chain was found, there is 571 // nothing to do. 
572 if (!EarliestReplacement) 573 return None; 574 575 return IC.replaceInstUsesWith(II, EarliestReplacement); 576 } 577 578 static Optional<Instruction *> instCombineSVESel(InstCombiner &IC, 579 IntrinsicInst &II) { 580 IRBuilder<> Builder(&II); 581 auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1), 582 II.getOperand(2)); 583 return IC.replaceInstUsesWith(II, Select); 584 } 585 586 static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC, 587 IntrinsicInst &II) { 588 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 589 if (!Pg) 590 return None; 591 592 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 593 return None; 594 595 const auto PTruePattern = 596 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 597 if (PTruePattern != AArch64SVEPredPattern::vl1) 598 return None; 599 600 // The intrinsic is inserting into lane zero so use an insert instead. 601 auto *IdxTy = Type::getInt64Ty(II.getContext()); 602 auto *Insert = InsertElementInst::Create( 603 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0)); 604 Insert->insertBefore(&II); 605 Insert->takeName(&II); 606 607 return IC.replaceInstUsesWith(II, Insert); 608 } 609 610 static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, 611 IntrinsicInst &II) { 612 // Replace DupX with a regular IR splat. 
613 IRBuilder<> Builder(II.getContext()); 614 Builder.SetInsertPoint(&II); 615 auto *RetTy = cast<ScalableVectorType>(II.getType()); 616 Value *Splat = 617 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0)); 618 Splat->takeName(&II); 619 return IC.replaceInstUsesWith(II, Splat); 620 } 621 622 static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, 623 IntrinsicInst &II) { 624 LLVMContext &Ctx = II.getContext(); 625 IRBuilder<> Builder(Ctx); 626 Builder.SetInsertPoint(&II); 627 628 // Check that the predicate is all active 629 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 630 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 631 return None; 632 633 const auto PTruePattern = 634 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 635 if (PTruePattern != AArch64SVEPredPattern::all) 636 return None; 637 638 // Check that we have a compare of zero.. 639 auto *SplatValue = 640 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2))); 641 if (!SplatValue || !SplatValue->isZero()) 642 return None; 643 644 // ..against a dupq 645 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 646 if (!DupQLane || 647 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) 648 return None; 649 650 // Where the dupq is a lane 0 replicate of a vector insert 651 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero()) 652 return None; 653 654 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); 655 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert) 656 return None; 657 658 // Where the vector insert is a fixed constant vector insert into undef at 659 // index zero 660 if (!isa<UndefValue>(VecIns->getArgOperand(0))) 661 return None; 662 663 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero()) 664 return None; 665 666 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1)); 667 if (!ConstVec) 668 return None; 669 670 auto *VecTy = 
dyn_cast<FixedVectorType>(ConstVec->getType()); 671 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType()); 672 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) 673 return None; 674 675 unsigned NumElts = VecTy->getNumElements(); 676 unsigned PredicateBits = 0; 677 678 // Expand intrinsic operands to a 16-bit byte level predicate 679 for (unsigned I = 0; I < NumElts; ++I) { 680 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I)); 681 if (!Arg) 682 return None; 683 if (!Arg->isZero()) 684 PredicateBits |= 1 << (I * (16 / NumElts)); 685 } 686 687 // If all bits are zero bail early with an empty predicate 688 if (PredicateBits == 0) { 689 auto *PFalse = Constant::getNullValue(II.getType()); 690 PFalse->takeName(&II); 691 return IC.replaceInstUsesWith(II, PFalse); 692 } 693 694 // Calculate largest predicate type used (where byte predicate is largest) 695 unsigned Mask = 8; 696 for (unsigned I = 0; I < 16; ++I) 697 if ((PredicateBits & (1 << I)) != 0) 698 Mask |= (I % 8); 699 700 unsigned PredSize = Mask & -Mask; 701 auto *PredType = ScalableVectorType::get( 702 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8)); 703 704 // Ensure all relevant bits are set 705 for (unsigned I = 0; I < 16; I += PredSize) 706 if ((PredicateBits & (1 << I)) == 0) 707 return None; 708 709 auto *PTruePat = 710 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 711 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 712 {PredType}, {PTruePat}); 713 auto *ConvertToSVBool = Builder.CreateIntrinsic( 714 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); 715 auto *ConvertFromSVBool = 716 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, 717 {II.getType()}, {ConvertToSVBool}); 718 719 ConvertFromSVBool->takeName(&II); 720 return IC.replaceInstUsesWith(II, ConvertFromSVBool); 721 } 722 723 static Optional<Instruction *> instCombineSVELast(InstCombiner &IC, 724 
IntrinsicInst &II) { 725 IRBuilder<> Builder(II.getContext()); 726 Builder.SetInsertPoint(&II); 727 Value *Pg = II.getArgOperand(0); 728 Value *Vec = II.getArgOperand(1); 729 auto IntrinsicID = II.getIntrinsicID(); 730 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; 731 732 // lastX(splat(X)) --> X 733 if (auto *SplatVal = getSplatValue(Vec)) 734 return IC.replaceInstUsesWith(II, SplatVal); 735 736 // If x and/or y is a splat value then: 737 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) 738 Value *LHS, *RHS; 739 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) { 740 if (isSplatValue(LHS) || isSplatValue(RHS)) { 741 auto *OldBinOp = cast<BinaryOperator>(Vec); 742 auto OpC = OldBinOp->getOpcode(); 743 auto *NewLHS = 744 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS}); 745 auto *NewRHS = 746 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS}); 747 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( 748 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II); 749 return IC.replaceInstUsesWith(II, NewBinOp); 750 } 751 } 752 753 auto *C = dyn_cast<Constant>(Pg); 754 if (IsAfter && C && C->isNullValue()) { 755 // The intrinsic is extracting lane 0 so use an extract instead. 756 auto *IdxTy = Type::getInt64Ty(II.getContext()); 757 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0)); 758 Extract->insertBefore(&II); 759 Extract->takeName(&II); 760 return IC.replaceInstUsesWith(II, Extract); 761 } 762 763 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg); 764 if (!IntrPG) 765 return None; 766 767 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 768 return None; 769 770 const auto PTruePattern = 771 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue(); 772 773 // Can the intrinsic's predicate be converted to a known constant index? 
774 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern); 775 if (!MinNumElts) 776 return None; 777 778 unsigned Idx = MinNumElts - 1; 779 // Increment the index if extracting the element after the last active 780 // predicate element. 781 if (IsAfter) 782 ++Idx; 783 784 // Ignore extracts whose index is larger than the known minimum vector 785 // length. NOTE: This is an artificial constraint where we prefer to 786 // maintain what the user asked for until an alternative is proven faster. 787 auto *PgVTy = cast<ScalableVectorType>(Pg->getType()); 788 if (Idx >= PgVTy->getMinNumElements()) 789 return None; 790 791 // The intrinsic is extracting a fixed lane so use an extract instead. 792 auto *IdxTy = Type::getInt64Ty(II.getContext()); 793 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx)); 794 Extract->insertBefore(&II); 795 Extract->takeName(&II); 796 return IC.replaceInstUsesWith(II, Extract); 797 } 798 799 static Optional<Instruction *> instCombineSVECondLast(InstCombiner &IC, 800 IntrinsicInst &II) { 801 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar 802 // integer variant across a variety of micro-architectures. Replace scalar 803 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple 804 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more 805 // depending on the micro-architecture, but has been observed as generally 806 // being faster, particularly when the CLAST[AB] op is a loop-carried 807 // dependency. 
808 IRBuilder<> Builder(II.getContext()); 809 Builder.SetInsertPoint(&II); 810 Value *Pg = II.getArgOperand(0); 811 Value *Fallback = II.getArgOperand(1); 812 Value *Vec = II.getArgOperand(2); 813 Type *Ty = II.getType(); 814 815 if (!Ty->isIntegerTy()) 816 return None; 817 818 Type *FPTy; 819 switch (cast<IntegerType>(Ty)->getBitWidth()) { 820 default: 821 return None; 822 case 16: 823 FPTy = Builder.getHalfTy(); 824 break; 825 case 32: 826 FPTy = Builder.getFloatTy(); 827 break; 828 case 64: 829 FPTy = Builder.getDoubleTy(); 830 break; 831 } 832 833 Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy); 834 auto *FPVTy = VectorType::get( 835 FPTy, cast<VectorType>(Vec->getType())->getElementCount()); 836 Value *FPVec = Builder.CreateBitCast(Vec, FPVTy); 837 auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()}, 838 {Pg, FPFallBack, FPVec}); 839 Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType()); 840 return IC.replaceInstUsesWith(II, FPIItoInt); 841 } 842 843 static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC, 844 IntrinsicInst &II) { 845 LLVMContext &Ctx = II.getContext(); 846 IRBuilder<> Builder(Ctx); 847 Builder.SetInsertPoint(&II); 848 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr 849 // can work with RDFFR_PP for ptest elimination. 
850 auto *AllPat = 851 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 852 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 853 {II.getType()}, {AllPat}); 854 auto *RDFFR = 855 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); 856 RDFFR->takeName(&II); 857 return IC.replaceInstUsesWith(II, RDFFR); 858 } 859 860 static Optional<Instruction *> 861 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { 862 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); 863 864 if (Pattern == AArch64SVEPredPattern::all) { 865 LLVMContext &Ctx = II.getContext(); 866 IRBuilder<> Builder(Ctx); 867 Builder.SetInsertPoint(&II); 868 869 Constant *StepVal = ConstantInt::get(II.getType(), NumElts); 870 auto *VScale = Builder.CreateVScale(StepVal); 871 VScale->takeName(&II); 872 return IC.replaceInstUsesWith(II, VScale); 873 } 874 875 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); 876 877 return MinNumElts && NumElts >= MinNumElts 878 ? 
                 Optional<Instruction *>(IC.replaceInstUsesWith(
                     II, ConstantInt::get(II.getType(), MinNumElts)))
             : None;
}

/// Fold ptest(to.svbool(a), to.svbool(b)) -> ptest(a, b) when both operands
/// widen predicates of the same type; the conversions are then redundant.
static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
  IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1));

  if (Op1 && Op2 &&
      Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {

    IRBuilder<> Builder(II.getContext());
    Builder.SetInsertPoint(&II);

    // Re-emit the same ptest variant directly on the narrower predicates.
    Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
    Type *Tys[] = {Op1->getArgOperand(0)->getType()};

    auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);

    PTest->takeName(&II);
    return IC.replaceInstUsesWith(II, PTest);
  }

  return None;
}

/// Combine a predicated fadd of a single-use predicated fmul (same predicate)
/// into a single predicated fmla, provided fast-math flags match and allow
/// contraction.
static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  // fold (fadd p a (fmul p b c)) -> (fma p a b c)
  Value *P = II.getOperand(0);
  Value *A = II.getOperand(1);
  auto FMul = II.getOperand(2);
  Value *B, *C;
  if (!match(FMul, m_Intrinsic<Intrinsic::aarch64_sve_fmul>(
                       m_Specific(P), m_Value(B), m_Value(C))))
    return None;

  // The fmul must die with this combine, otherwise we duplicate work.
  if (!FMul->hasOneUse())
    return None;

  llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
  // Stop the combine when the flags on the inputs differ in case dropping flags
  // would lead to us missing out on more beneficial optimizations.
  if (FAddFlags != cast<CallInst>(FMul)->getFastMathFlags())
    return None;
  // Contraction of mul+add into fma must be explicitly permitted.
  if (!FAddFlags.allowContract())
    return None;

  IRBuilder<> Builder(II.getContext());
  Builder.SetInsertPoint(&II);
  auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla,
                                      {II.getType()}, {P, A, B, C}, &II);
  FMLA->setFastMathFlags(FAddFlags);
  return IC.replaceInstUsesWith(II, FMLA);
}

/// Return true when Pred is known to be an all-active ptrue predicate, looking
/// through a lossless svbool round-trip cast first.
static bool isAllActivePredicate(Value *Pred) {
  // Look through convert.from.svbool(convert.to.svbool(...) chain.
  Value *UncastedPred;
  if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
                      m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
                          m_Value(UncastedPred)))))
    // If the predicate has the same or less lanes than the uncasted
    // predicate then we know the casting has no effect.
    if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
        cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
      Pred = UncastedPred;

  return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                         m_ConstantInt<AArch64SVEPredPattern::all>()));
}

/// Lower sve.ld1 to a plain load when the predicate is all-active, otherwise
/// to the generic llvm.masked.load (zero passthru), preserving metadata.
static Optional<Instruction *>
instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
  IRBuilder<> Builder(II.getContext());
  Builder.SetInsertPoint(&II);

  Value *Pred = II.getOperand(0);
  Value *PtrOp = II.getOperand(1);
  Type *VecTy = II.getType();
  Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo());

  // An all-active predicate means every lane is loaded: use an ordinary load.
  if (isAllActivePredicate(Pred)) {
    LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr);
    Load->copyMetadata(II);
    return IC.replaceInstUsesWith(II, Load);
  }

  CallInst *MaskedLoad =
      Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL),
                               Pred, ConstantAggregateZero::get(VecTy));
  MaskedLoad->copyMetadata(II);
  return IC.replaceInstUsesWith(II, MaskedLoad);
}

static Optional<Instruction *>
// Lower sve.st1 to a plain store when the predicate is all-active, otherwise
// to the generic llvm.masked.store, preserving metadata. (The declaration
// line for this function is at the end of the previous chunk.)
instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
  IRBuilder<> Builder(II.getContext());
  Builder.SetInsertPoint(&II);

  Value *VecOp = II.getOperand(0);
  Value *Pred = II.getOperand(1);
  Value *PtrOp = II.getOperand(2);
  Value *VecPtr =
      Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo());

  // An all-active predicate means every lane is stored: use an ordinary store.
  if (isAllActivePredicate(Pred)) {
    StoreInst *Store = Builder.CreateStore(VecOp, VecPtr);
    Store->copyMetadata(II);
    return IC.eraseInstFromFunction(II);
  }

  CallInst *MaskedStore = Builder.CreateMaskedStore(
      VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred);
  MaskedStore->copyMetadata(II);
  return IC.eraseInstFromFunction(II);
}

/// Map an SVE FP arithmetic intrinsic ID to the equivalent IR binary opcode,
/// or BinaryOpsEnd (used as a "no mapping" sentinel) for anything else.
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
  switch (Intrinsic) {
  case Intrinsic::aarch64_sve_fmul:
    return Instruction::BinaryOps::FMul;
  case Intrinsic::aarch64_sve_fadd:
    return Instruction::BinaryOps::FAdd;
  case Intrinsic::aarch64_sve_fsub:
    return Instruction::BinaryOps::FSub;
  default:
    return Instruction::BinaryOpsEnd;
  }
}

/// Replace a predicated SVE FP binop whose predicate is an all-active ptrue
/// with the plain IR binop, carrying over the fast-math flags.
static Optional<Instruction *> instCombineSVEVectorBinOp(InstCombiner &IC,
                                                         IntrinsicInst &II) {
  auto *OpPredicate = II.getOperand(0);
  auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
  if (BinOpCode == Instruction::BinaryOpsEnd ||
      !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                              m_ConstantInt<AArch64SVEPredPattern::all>())))
    return None;
  IRBuilder<> Builder(II.getContext());
  Builder.SetInsertPoint(&II);
  // Propagate FMF onto the newly created binop.
  Builder.setFastMathFlags(II.getFastMathFlags());
  auto BinOp =
      Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
  return IC.replaceInstUsesWith(II, BinOp);
}

/// For sve.fadd: first try the fadd+fmul -> fmla contraction, then fall back
/// to lowering as a plain binop.
static Optional<Instruction *> instCombineSVEVectorFAdd(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  if (auto FMLA = instCombineSVEVectorFMLA(IC, II))
    return FMLA;
  return instCombineSVEVectorBinOp(IC, II);
}

/// Simplify predicated [f]mul by a multiplier known to be all-ones, and
/// otherwise defer to the generic binop lowering.
static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  auto *OpPredicate = II.getOperand(0);
  auto *OpMultiplicand = II.getOperand(1);
  auto *OpMultiplier = II.getOperand(2);

  IRBuilder<> Builder(II.getContext());
  Builder.SetInsertPoint(&II);

  // Return true if a given instruction is a unit splat value, false otherwise.
  auto IsUnitSplat = [](auto *I) {
    auto *SplatValue = getSplatValue(I);
    if (!SplatValue)
      return false;
    return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
  };

  // Return true if a given instruction is an aarch64_sve_dup intrinsic call
  // with a unit splat value, false otherwise.
  auto IsUnitDup = [](auto *I) {
    auto *IntrI = dyn_cast<IntrinsicInst>(I);
    if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
      return false;

    auto *SplatValue = IntrI->getOperand(2);
    return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
  };

  if (IsUnitSplat(OpMultiplier)) {
    // [f]mul pg %n, (dupx 1) => %n
    OpMultiplicand->takeName(&II);
    return IC.replaceInstUsesWith(II, OpMultiplicand);
  } else if (IsUnitDup(OpMultiplier)) {
    // [f]mul pg %n, (dup pg 1) => %n
    auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
    auto *DupPg = DupInst->getOperand(1);
    // TODO: this is naive. The optimization is still valid if DupPg
    // 'encompasses' OpPredicate, not only if they're the same predicate.
    // Only safe when the dup used the same governing predicate: inactive
    // lanes of the dup are then exactly the inactive lanes of the mul.
    if (OpPredicate == DupPg) {
      OpMultiplicand->takeName(&II);
      return IC.replaceInstUsesWith(II, OpMultiplicand);
    }
  }

  return instCombineSVEVectorBinOp(IC, II);
}

/// Fold [su]unpk{hi,lo} of a splat into a splat of the (sign/zero) extended
/// scalar, since every output lane is the same extended value.
static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
                                                    IntrinsicInst &II) {
  IRBuilder<> Builder(II.getContext());
  Builder.SetInsertPoint(&II);
  Value *UnpackArg = II.getArgOperand(0);
  auto *RetTy = cast<ScalableVectorType>(II.getType());
  // sunpkhi/sunpklo sign-extend; uunpkhi/uunpklo zero-extend.
  bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
                  II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;

  // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
  // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
  if (auto *ScalarArg = getSplatValue(UnpackArg)) {
    ScalarArg =
        Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
    Value *NewVal =
        Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
    NewVal->takeName(&II);
    return IC.replaceInstUsesWith(II, NewVal);
  }

  return None;
}
/// Fold sve.tbl with a constant in-range splat index vector into a splat of
/// the single selected element.
static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
                                                 IntrinsicInst &II) {
  auto *OpVal = II.getOperand(0);
  auto *OpIndices = II.getOperand(1);
  VectorType *VTy = cast<VectorType>(II.getType());

  // Check whether OpIndices is a constant splat value < minimal element count
  // of result.
  auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
  if (!SplatValue ||
      SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
    return None;

  // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
  // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
  IRBuilder<> Builder(II.getContext());
  Builder.SetInsertPoint(&II);
  auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
  auto *VectorSplat =
      Builder.CreateVectorSplat(VTy->getElementCount(), Extract);

  VectorSplat->takeName(&II);
  return IC.replaceInstUsesWith(II, VectorSplat);
}

/// Simplify tuple_get fed by a tuple_set: either forward the just-set value
/// (same index) or skip the irrelevant tuple_set (different index).
static Optional<Instruction *> instCombineSVETupleGet(InstCombiner &IC,
                                                      IntrinsicInst &II) {
  // Try to remove sequences of tuple get/set.
  Value *SetTuple, *SetIndex, *SetValue;
  auto *GetTuple = II.getArgOperand(0);
  auto *GetIndex = II.getArgOperand(1);
  // Check that we have tuple_get(GetTuple, GetIndex) where GetTuple is a
  // call to tuple_set i.e. tuple_set(SetTuple, SetIndex, SetValue).
  // Make sure that the types of the current intrinsic and SetValue match
  // in order to safely remove the sequence.
  if (!match(GetTuple,
             m_Intrinsic<Intrinsic::aarch64_sve_tuple_set>(
                 m_Value(SetTuple), m_Value(SetIndex), m_Value(SetValue))) ||
      SetValue->getType() != II.getType())
    return None;
  // Case where we get the same index right after setting it.
  // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) --> SetValue
  if (GetIndex == SetIndex)
    return IC.replaceInstUsesWith(II, SetValue);
  // If we are getting a different index than what was set in the tuple_set
  // intrinsic. We can just set the input tuple to the one up in the chain.
  // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex)
  // --> tuple_get(SetTuple, GetIndex)
  return IC.replaceOperand(II, 0, SetTuple);
}

/// Cancel a zip of matching unzips: zip1/zip2 of (uzp1(A,B), uzp2(A,B))
/// reconstructs A (zip1) or B (zip2) exactly.
static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
                                                 IntrinsicInst &II) {
  // zip1(uzp1(A, B), uzp2(A, B)) --> A
  // zip2(uzp1(A, B), uzp2(A, B)) --> B
  Value *A, *B;
  if (match(II.getArgOperand(0),
            m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
      match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
                                     m_Specific(A), m_Specific(B))))
    return IC.replaceInstUsesWith(
        II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));

  return None;
}

/// Rewrite a gather whose index vector is a stride-1 sve.index sequence as an
/// ordinary masked load from the computed base address.
static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC,
                                                         IntrinsicInst &II) {
  Value *Mask = II.getOperand(0);
  Value *BasePtr = II.getOperand(1);
  Value *Index = II.getOperand(2);
  Type *Ty = II.getType();
  // Inactive lanes of the masked load yield zero, matching ld1 semantics.
  Value *PassThru = ConstantAggregateZero::get(Ty);

  // Contiguous gather => masked load.
  // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
  // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
  Value *IndexBase;
  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
                       m_Value(IndexBase), m_SpecificInt(1)))) {
    IRBuilder<> Builder(II.getContext());
    Builder.SetInsertPoint(&II);

    Align Alignment =
        BasePtr->getPointerAlignment(II.getModule()->getDataLayout());

    Type *VecPtrTy = PointerType::getUnqual(Ty);
    Value *Ptr = Builder.CreateGEP(
        cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase);
    Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
    CallInst *MaskedLoad =
        Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
    MaskedLoad->takeName(&II);
    return IC.replaceInstUsesWith(II, MaskedLoad);
  }

  return None;
}

/// Rewrite a scatter whose index vector is a stride-1 sve.index sequence as
/// an ordinary masked store to the computed base address.
static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC,
                                                          IntrinsicInst &II) {
  Value *Val = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *BasePtr = II.getOperand(2);
  Value *Index = II.getOperand(3);
  Type *Ty = Val->getType();

  // Contiguous scatter => masked store.
  // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
  // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
  Value *IndexBase;
  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
                       m_Value(IndexBase), m_SpecificInt(1)))) {
    IRBuilder<> Builder(II.getContext());
    Builder.SetInsertPoint(&II);

    Align Alignment =
        BasePtr->getPointerAlignment(II.getModule()->getDataLayout());

    Value *Ptr = Builder.CreateGEP(
        cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase);
    Type *VecPtrTy = PointerType::getUnqual(Ty);
    Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);

    // The store is created for its side effect; the scatter itself is erased.
    (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);

    return IC.eraseInstFromFunction(II);
  }

  return None;
}

/// Strength-reduce a predicated signed divide by a splat of +/-2^k into the
/// SVE asrd (rounding arithmetic shift) intrinsic, plus a predicated negate
/// for negative divisors.
static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
                                                  IntrinsicInst &II) {
  IRBuilder<> Builder(II.getContext());
  Builder.SetInsertPoint(&II);
  Type *Int32Ty = Builder.getInt32Ty();
  Value *Pred = II.getOperand(0);
  Value *Vec = II.getOperand(1);
  Value *DivVec = II.getOperand(2);

  // Only splat-constant divisors are handled.
  Value *SplatValue = getSplatValue(DivVec);
  ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
  if (!SplatConstantInt)
    return None;
  APInt Divisor = SplatConstantInt->getValue();

  // x sdiv 2^k --> asrd(x, k): asrd rounds towards zero like sdiv.
  if (Divisor.isPowerOf2()) {
    Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
    auto ASRD = Builder.CreateIntrinsic(
        Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
    return IC.replaceInstUsesWith(II, ASRD);
  }
  // x sdiv -(2^k) --> neg(asrd(x, k)) under the same predicate.
  if (Divisor.isNegatedPowerOf2()) {
    Divisor.negate();
    Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
    auto ASRD = Builder.CreateIntrinsic(
        Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
    auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
                                       {ASRD->getType()}, {ASRD, Pred, ASRD});
    return IC.replaceInstUsesWith(II, NEG);
  }

  return None;
}

/// fmaxnm/fminnm with identical operands is the operand itself.
static Optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  Value *A = II.getArgOperand(0);
  Value *B = II.getArgOperand(1);
  if (A == B)
    return IC.replaceInstUsesWith(II, A);

  return None;
}

/// Simplify sve.srshl to sve.lsl when the shiftee is known non-negative
/// (comes out of an abs/sqabs) and the shift amount is non-negative, so the
/// rounding behaviour of srshl can never kick in.
static Optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  IRBuilder<> Builder(&II);
  Value *Pred = II.getOperand(0);
  Value *Vec = II.getOperand(1);
  Value *Shift = II.getOperand(2);

  // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
  Value *AbsPred, *MergedValue;
  if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
                      m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
      !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
                      m_Value(MergedValue), m_Value(AbsPred), m_Value())))

    return None;

  // Transform is valid if any of the following are true:
  // * The ABS merge value is an undef or non-negative
  // * The ABS predicate is all active
  // * The ABS predicate and the SRSHL predicates are the same
  if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
      AbsPred != Pred && !isAllActivePredicate(AbsPred))
    return None;

  // Only valid when the shift amount is non-negative, otherwise the rounding
  // behaviour of SRSHL cannot be ignored.
  if (!match(Shift, m_NonNegative()))
    return None;

  auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()},
                                     {Pred, Vec, Shift});

  return IC.replaceInstUsesWith(II, LSL);
}

/// Target hook: dispatch AArch64-specific InstCombine folds for NEON and SVE
/// intrinsics. Returns the replacement instruction, or None when no
/// target-specific simplification applies.
Optional<Instruction *>
AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                     IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::aarch64_neon_fmaxnm:
  case Intrinsic::aarch64_neon_fminnm:
    return instCombineMaxMinNM(IC, II);
  case Intrinsic::aarch64_sve_convert_from_svbool:
    return instCombineConvertFromSVBool(IC, II);
  case Intrinsic::aarch64_sve_dup:
    return instCombineSVEDup(IC, II);
  case Intrinsic::aarch64_sve_dup_x:
    return instCombineSVEDupX(IC, II);
  case Intrinsic::aarch64_sve_cmpne:
  case Intrinsic::aarch64_sve_cmpne_wide:
    return instCombineSVECmpNE(IC, II);
  case Intrinsic::aarch64_sve_rdffr:
    return instCombineRDFFR(IC, II);
  case Intrinsic::aarch64_sve_lasta:
  case Intrinsic::aarch64_sve_lastb:
    return instCombineSVELast(IC, II);
  case Intrinsic::aarch64_sve_clasta_n:
  case Intrinsic::aarch64_sve_clastb_n:
    return instCombineSVECondLast(IC, II);
  // cntd/cntw/cnth/cntb: elements per 128-bit granule are 2/4/8/16.
  case Intrinsic::aarch64_sve_cntd:
    return instCombineSVECntElts(IC, II, 2);
  case Intrinsic::aarch64_sve_cntw:
    return instCombineSVECntElts(IC, II, 4);
  case Intrinsic::aarch64_sve_cnth:
    return instCombineSVECntElts(IC, II, 8);
  case Intrinsic::aarch64_sve_cntb:
    return instCombineSVECntElts(IC, II, 16);
  case Intrinsic::aarch64_sve_ptest_any:
  case Intrinsic::aarch64_sve_ptest_first:
  case Intrinsic::aarch64_sve_ptest_last:
    return instCombineSVEPTest(IC, II);
  case Intrinsic::aarch64_sve_mul:
  case Intrinsic::aarch64_sve_fmul:
    return instCombineSVEVectorMul(IC, II);
  case Intrinsic::aarch64_sve_fadd:
    return instCombineSVEVectorFAdd(IC, II);
  case Intrinsic::aarch64_sve_fsub:
    return instCombineSVEVectorBinOp(IC, II);
  case Intrinsic::aarch64_sve_tbl:
    return instCombineSVETBL(IC, II);
  case Intrinsic::aarch64_sve_uunpkhi:
  case Intrinsic::aarch64_sve_uunpklo:
  case Intrinsic::aarch64_sve_sunpkhi:
  case Intrinsic::aarch64_sve_sunpklo:
    return instCombineSVEUnpack(IC, II);
  case Intrinsic::aarch64_sve_tuple_get:
    return instCombineSVETupleGet(IC, II);
  case Intrinsic::aarch64_sve_zip1:
  case Intrinsic::aarch64_sve_zip2:
    return instCombineSVEZip(IC, II);
  case Intrinsic::aarch64_sve_ld1_gather_index:
    return instCombineLD1GatherIndex(IC, II);
  case Intrinsic::aarch64_sve_st1_scatter_index:
    return instCombineST1ScatterIndex(IC, II);
  case Intrinsic::aarch64_sve_ld1:
    return instCombineSVELD1(IC, II, DL);
  case Intrinsic::aarch64_sve_st1:
    return instCombineSVEST1(IC, II, DL);
  case Intrinsic::aarch64_sve_sdiv:
    return instCombineSVESDIV(IC, II);
  case Intrinsic::aarch64_sve_sel:
    return instCombineSVESel(IC, II);
  case Intrinsic::aarch64_sve_srshl:
    return instCombineSVESrshl(IC, II);
  }

  return None;
}

/// Target hook: demanded-elements simplification for AArch64 intrinsics. For
/// the listed NEON narrowing/saturating ops, the result's demanded elements
/// are forwarded unchanged to operand 0.
Optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::aarch64_neon_fcvtxn:
  case Intrinsic::aarch64_neon_rshrn:
  case Intrinsic::aarch64_neon_sqrshrn:
  case Intrinsic::aarch64_neon_sqrshrun:
  case Intrinsic::aarch64_neon_sqshrn:
  case Intrinsic::aarch64_neon_sqshrun:
  case Intrinsic::aarch64_neon_sqxtn:
  case Intrinsic::aarch64_neon_sqxtun:
  case Intrinsic::aarch64_neon_uqrshrn:
  case Intrinsic::aarch64_neon_uqshrn:
  case Intrinsic::aarch64_neon_uqxtn:
    SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
    break;
  }

  return None;
}

/// Return true when (Opcode, Args) producing DstTy maps onto one of the
/// AArch64 widening instructions (uaddl/saddl, uaddw/saddw, umull/smull,
/// etc.), meaning the feeding extend is folded into the operation for free.
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                           ArrayRef<const Value *> Args) {

  // A helper that returns a vector type from the given type. The number of
  // elements in type Ty determines the vector width.
  auto toVectorTy = [&](Type *ArgTy) {
    return VectorType::get(ArgTy->getScalarType(),
                           cast<VectorType>(DstTy)->getElementCount());
  };

  // Exit early if DstTy is not a vector type whose elements are at least
  // 16-bits wide.
  if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
    return false;

  // Determine if the operation has a widening variant. We consider both the
  // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
  // instructions.
  //
  // TODO: Add additional widening operations (e.g., shl, etc.) once we
  // verify that their extending operands are eliminated during code
  // generation.
  switch (Opcode) {
  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
  case Instruction::Mul: // SMULL(2), UMULL(2)
    break;
  default:
    return false;
  }

  // To be a widening instruction (either the "wide" or "long" versions), the
  // second operand must be a sign- or zero extend.
  if (Args.size() != 2 ||
      (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])))
    return false;
  auto *Extend = cast<CastInst>(Args[1]);
  auto *Arg0 = dyn_cast<CastInst>(Args[0]);

  // A mul only has a mull version (not like addw). Both operands need to be
  // extending and the same type.
  if (Opcode == Instruction::Mul &&
      (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() ||
       Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType()))
    return false;

  // Legalize the destination type and ensure it can be used in a widening
  // operation.
  auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
  unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
  if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
    return false;

  // Legalize the source type and ensure it can be used in a widening
  // operation.
  auto *SrcTy = toVectorTy(Extend->getSrcTy());
  auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
  if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
    return false;

  // Get the total number of vector elements in the legalized types.
  InstructionCost NumDstEls =
      DstTyL.first * DstTyL.second.getVectorMinNumElements();
  InstructionCost NumSrcEls =
      SrcTyL.first * SrcTyL.second.getVectorMinNumElements();

  // Return true if the legalized types have the same number of vector elements
  // and the destination element type size is twice that of the source type.
  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
}

/// Cost-model entry for cast instructions; body (including the conversion
/// cost table) continues beyond this chunk.
InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                 Type *Src,
                                                 TTI::CastContextHint CCH,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // If the cast is observable, and it is used by a widening instruction (e.g.,
  // uaddl, saddw, etc.), it may be free.
1502 if (I && I->hasOneUser()) { 1503 auto *SingleUser = cast<Instruction>(*I->user_begin()); 1504 SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); 1505 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { 1506 // If the cast is the second operand, it is free. We will generate either 1507 // a "wide" or "long" version of the widening instruction. 1508 if (I == SingleUser->getOperand(1)) 1509 return 0; 1510 // If the cast is not the second operand, it will be free if it looks the 1511 // same as the second operand. In this case, we will generate a "long" 1512 // version of the widening instruction. 1513 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) 1514 if (I->getOpcode() == unsigned(Cast->getOpcode()) && 1515 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) 1516 return 0; 1517 } 1518 } 1519 1520 // TODO: Allow non-throughput costs that aren't binary. 1521 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { 1522 if (CostKind != TTI::TCK_RecipThroughput) 1523 return Cost == 0 ? 
0 : 1; 1524 return Cost; 1525 }; 1526 1527 EVT SrcTy = TLI->getValueType(DL, Src); 1528 EVT DstTy = TLI->getValueType(DL, Dst); 1529 1530 if (!SrcTy.isSimple() || !DstTy.isSimple()) 1531 return AdjustCost( 1532 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1533 1534 static const TypeConversionCostTblEntry 1535 ConversionTbl[] = { 1536 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, 1537 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, 1538 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, 1539 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, 1540 1541 // Truncations on nxvmiN 1542 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, 1543 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, 1544 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, 1545 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, 1546 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, 1547 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, 1548 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, 1549 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, 1550 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, 1551 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, 1552 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, 1553 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, 1554 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, 1555 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, 1556 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, 1557 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, 1558 1559 // The number of shll instructions for the extension. 
1560 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1561 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1562 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1563 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1564 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1565 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1566 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1567 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1568 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1569 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1570 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1571 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1572 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1573 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1574 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1575 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1576 1577 // LowerVectorINT_TO_FP: 1578 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1579 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1580 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1581 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1582 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1583 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1584 1585 // Complex: to v2f32 1586 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1587 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1588 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1589 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1590 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1591 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1592 1593 // Complex: to v4f32 1594 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, 1595 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1596 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 1597 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1598 1599 // Complex: to v8f32 1600 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1601 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 1602 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1603 
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 1604 1605 // Complex: to v16f32 1606 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1607 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1608 1609 // Complex: to v2f64 1610 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1611 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1612 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1613 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1614 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1615 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1616 1617 1618 // LowerVectorFP_TO_INT 1619 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, 1620 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 1621 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 1622 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, 1623 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 1624 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 1625 1626 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). 1627 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, 1628 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, 1629 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, 1630 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, 1631 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, 1632 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, 1633 1634 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 1635 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, 1636 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, 1637 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, 1638 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, 1639 1640 // Complex, from nxv2f32. 
1641 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1642 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1643 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1644 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1645 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1646 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1647 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1648 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1649 1650 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. 1651 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, 1652 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, 1653 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, 1654 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, 1655 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, 1656 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, 1657 1658 // Complex, from nxv2f64. 1659 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1660 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1661 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1662 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1663 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1664 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1665 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1666 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1667 1668 // Complex, from nxv4f32. 1669 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1670 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1671 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1672 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1673 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1674 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1675 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1676 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1677 1678 // Complex, from nxv8f64. Illegal -> illegal conversions not required. 
1679 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1680 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1681 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1682 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1683 1684 // Complex, from nxv4f64. Illegal -> illegal conversions not required. 1685 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1686 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1687 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1688 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1689 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1690 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1691 1692 // Complex, from nxv8f32. Illegal -> illegal conversions not required. 1693 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 1694 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 1695 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 1696 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 1697 1698 // Complex, from nxv8f16. 1699 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 1700 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 1701 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 1702 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 1703 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 1704 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 1705 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 1706 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 1707 1708 // Complex, from nxv4f16. 1709 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 1710 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 1711 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 1712 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 1713 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 1714 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 1715 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 1716 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 1717 1718 // Complex, from nxv2f16. 
1719 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 1720 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 1721 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 1722 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 1723 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 1724 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 1725 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 1726 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 1727 1728 // Truncate from nxvmf32 to nxvmf16. 1729 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, 1730 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, 1731 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, 1732 1733 // Truncate from nxvmf64 to nxvmf16. 1734 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 }, 1735 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 }, 1736 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 }, 1737 1738 // Truncate from nxvmf64 to nxvmf32. 1739 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 }, 1740 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 }, 1741 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 }, 1742 1743 // Extend from nxvmf16 to nxvmf32. 1744 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, 1745 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, 1746 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, 1747 1748 // Extend from nxvmf16 to nxvmf64. 1749 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, 1750 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, 1751 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, 1752 1753 // Extend from nxvmf32 to nxvmf64. 
1754 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, 1755 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, 1756 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, 1757 1758 // Bitcasts from float to integer 1759 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 }, 1760 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 }, 1761 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 }, 1762 1763 // Bitcasts from integer to float 1764 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 }, 1765 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 }, 1766 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 }, 1767 }; 1768 1769 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, 1770 DstTy.getSimpleVT(), 1771 SrcTy.getSimpleVT())) 1772 return AdjustCost(Entry->Cost); 1773 1774 static const TypeConversionCostTblEntry FP16Tbl[] = { 1775 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs 1776 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, 1777 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs 1778 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, 1779 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs 1780 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, 1781 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn 1782 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, 1783 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs 1784 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, 1785 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs 1786 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, 1787 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn 1788 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, 1789 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs 1790 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, 1791 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs 1792 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, 1793 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf 1794 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + 
scvtf 1795 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf 1796 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf 1797 }; 1798 1799 if (ST->hasFullFP16()) 1800 if (const auto *Entry = ConvertCostTableLookup( 1801 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 1802 return AdjustCost(Entry->Cost); 1803 1804 return AdjustCost( 1805 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1806 } 1807 1808 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, 1809 Type *Dst, 1810 VectorType *VecTy, 1811 unsigned Index) { 1812 1813 // Make sure we were given a valid extend opcode. 1814 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && 1815 "Invalid opcode"); 1816 1817 // We are extending an element we extract from a vector, so the source type 1818 // of the extend is the element type of the vector. 1819 auto *Src = VecTy->getElementType(); 1820 1821 // Sign- and zero-extends are for integer types only. 1822 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); 1823 1824 // Get the cost for the extract. We compute the cost (if any) for the extend 1825 // below. 1826 InstructionCost Cost = 1827 getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); 1828 1829 // Legalize the types. 1830 auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); 1831 auto DstVT = TLI->getValueType(DL, Dst); 1832 auto SrcVT = TLI->getValueType(DL, Src); 1833 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1834 1835 // If the resulting type is still a vector and the destination type is legal, 1836 // we may get the extension for free. If not, get the default cost for the 1837 // extend. 1838 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) 1839 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1840 CostKind); 1841 1842 // The destination type should be larger than the element type. 
If not, get 1843 // the default cost for the extend. 1844 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) 1845 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1846 CostKind); 1847 1848 switch (Opcode) { 1849 default: 1850 llvm_unreachable("Opcode should be either SExt or ZExt"); 1851 1852 // For sign-extends, we only need a smov, which performs the extension 1853 // automatically. 1854 case Instruction::SExt: 1855 return Cost; 1856 1857 // For zero-extends, the extend is performed automatically by a umov unless 1858 // the destination type is i64 and the element type is i8 or i16. 1859 case Instruction::ZExt: 1860 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) 1861 return Cost; 1862 } 1863 1864 // If we are unable to perform the extend for free, get the default cost. 1865 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1866 CostKind); 1867 } 1868 1869 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, 1870 TTI::TargetCostKind CostKind, 1871 const Instruction *I) { 1872 if (CostKind != TTI::TCK_RecipThroughput) 1873 return Opcode == Instruction::PHI ? 0 : 1; 1874 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); 1875 // Branches are assumed to be predicted. 1876 return 0; 1877 } 1878 1879 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 1880 unsigned Index) { 1881 assert(Val->isVectorTy() && "This must be a vector type"); 1882 1883 if (Index != -1U) { 1884 // Legalize the type. 1885 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); 1886 1887 // This type is legalized to a scalar type. 1888 if (!LT.second.isVector()) 1889 return 0; 1890 1891 // The type may be split. For fixed-width vectors we can normalize the 1892 // index to the new type. 
1893 if (LT.second.isFixedLengthVector()) { 1894 unsigned Width = LT.second.getVectorNumElements(); 1895 Index = Index % Width; 1896 } 1897 1898 // The element at index zero is already inside the vector. 1899 if (Index == 0) 1900 return 0; 1901 } 1902 1903 // All other insert/extracts cost this much. 1904 return ST->getVectorInsertExtractBaseCost(); 1905 } 1906 1907 InstructionCost AArch64TTIImpl::getArithmeticInstrCost( 1908 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 1909 TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, 1910 TTI::OperandValueProperties Opd1PropInfo, 1911 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, 1912 const Instruction *CxtI) { 1913 // TODO: Handle more cost kinds. 1914 if (CostKind != TTI::TCK_RecipThroughput) 1915 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 1916 Opd2Info, Opd1PropInfo, 1917 Opd2PropInfo, Args, CxtI); 1918 1919 // Legalize the type. 1920 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 1921 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1922 1923 switch (ISD) { 1924 default: 1925 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 1926 Opd2Info, Opd1PropInfo, Opd2PropInfo); 1927 case ISD::SDIV: 1928 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue && 1929 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { 1930 // On AArch64, scalar signed division by constants power-of-two are 1931 // normally expanded to the sequence ADD + CMP + SELECT + SRA. 1932 // The OperandValue properties many not be same as that of previous 1933 // operation; conservatively assume OP_None. 
1934 InstructionCost Cost = getArithmeticInstrCost( 1935 Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info, 1936 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 1937 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Opd1Info, 1938 Opd2Info, TargetTransformInfo::OP_None, 1939 TargetTransformInfo::OP_None); 1940 Cost += getArithmeticInstrCost( 1941 Instruction::Select, Ty, CostKind, Opd1Info, Opd2Info, 1942 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 1943 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Opd1Info, 1944 Opd2Info, TargetTransformInfo::OP_None, 1945 TargetTransformInfo::OP_None); 1946 return Cost; 1947 } 1948 LLVM_FALLTHROUGH; 1949 case ISD::UDIV: { 1950 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) { 1951 auto VT = TLI->getValueType(DL, Ty); 1952 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) { 1953 // Vector signed division by constant are expanded to the 1954 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division 1955 // to MULHS + SUB + SRL + ADD + SRL. 1956 InstructionCost MulCost = getArithmeticInstrCost( 1957 Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info, 1958 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 1959 InstructionCost AddCost = getArithmeticInstrCost( 1960 Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info, 1961 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 1962 InstructionCost ShrCost = getArithmeticInstrCost( 1963 Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info, 1964 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 1965 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1; 1966 } 1967 } 1968 1969 InstructionCost Cost = BaseT::getArithmeticInstrCost( 1970 Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); 1971 if (Ty->isVectorTy()) { 1972 // On AArch64, vector divisions are not supported natively and are 1973 // expanded into scalar divisions of each pair of elements. 
1974 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind, 1975 Opd1Info, Opd2Info, Opd1PropInfo, 1976 Opd2PropInfo); 1977 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind, 1978 Opd1Info, Opd2Info, Opd1PropInfo, 1979 Opd2PropInfo); 1980 // TODO: if one of the arguments is scalar, then it's not necessary to 1981 // double the cost of handling the vector elements. 1982 Cost += Cost; 1983 } 1984 return Cost; 1985 } 1986 case ISD::MUL: 1987 // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive 1988 // as elements are extracted from the vectors and the muls scalarized. 1989 // As getScalarizationOverhead is a bit too pessimistic, we estimate the 1990 // cost for a i64 vector directly here, which is: 1991 // - four 2-cost i64 extracts, 1992 // - two 2-cost i64 inserts, and 1993 // - two 1-cost muls. 1994 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with 1995 // LT.first = 2 the cost is 28. If both operands are extensions it will not 1996 // need to scalarize so the cost can be cheaper (smull or umull). 1997 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) 1998 return LT.first; 1999 return LT.first * 14; 2000 case ISD::ADD: 2001 case ISD::XOR: 2002 case ISD::OR: 2003 case ISD::AND: 2004 case ISD::SRL: 2005 case ISD::SRA: 2006 case ISD::SHL: 2007 // These nodes are marked as 'custom' for combining purposes only. 2008 // We know that they are legal. See LowerAdd in ISelLowering. 2009 return LT.first; 2010 2011 case ISD::FADD: 2012 case ISD::FSUB: 2013 case ISD::FMUL: 2014 case ISD::FDIV: 2015 case ISD::FNEG: 2016 // These nodes are marked as 'custom' just to lower them to SVE. 2017 // We know said lowering will incur no additional cost. 
2018 if (!Ty->getScalarType()->isFP128Ty()) 2019 return 2 * LT.first; 2020 2021 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 2022 Opd2Info, Opd1PropInfo, Opd2PropInfo); 2023 } 2024 } 2025 2026 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, 2027 ScalarEvolution *SE, 2028 const SCEV *Ptr) { 2029 // Address computations in vectorized code with non-consecutive addresses will 2030 // likely result in more instructions compared to scalar code where the 2031 // computation can more often be merged into the index mode. The resulting 2032 // extra micro-ops can significantly decrease throughput. 2033 unsigned NumVectorInstToHideOverhead = 10; 2034 int MaxMergeDistance = 64; 2035 2036 if (Ty->isVectorTy() && SE && 2037 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) 2038 return NumVectorInstToHideOverhead; 2039 2040 // In many cases the address computation is not merged into the instruction 2041 // addressing mode. 2042 return 1; 2043 } 2044 2045 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 2046 Type *CondTy, 2047 CmpInst::Predicate VecPred, 2048 TTI::TargetCostKind CostKind, 2049 const Instruction *I) { 2050 // TODO: Handle other cost kinds. 2051 if (CostKind != TTI::TCK_RecipThroughput) 2052 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 2053 I); 2054 2055 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2056 // We don't lower some vector selects well that are wider than the register 2057 // width. 2058 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { 2059 // We would need this many instructions to hide the scalarization happening. 2060 const int AmortizationCost = 20; 2061 2062 // If VecPred is not set, check if we can get a predicate from the context 2063 // instruction, if its type matches the requested ValTy. 
2064 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { 2065 CmpInst::Predicate CurrentPred; 2066 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), 2067 m_Value()))) 2068 VecPred = CurrentPred; 2069 } 2070 // Check if we have a compare/select chain that can be lowered using 2071 // a (F)CMxx & BFI pair. 2072 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE || 2073 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT || 2074 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ || 2075 VecPred == CmpInst::FCMP_UNE) { 2076 static const auto ValidMinMaxTys = { 2077 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, 2078 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64}; 2079 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16}; 2080 2081 auto LT = TLI->getTypeLegalizationCost(DL, ValTy); 2082 if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; }) || 2083 (ST->hasFullFP16() && 2084 any_of(ValidFP16MinMaxTys, [<](MVT M) { return M == LT.second; }))) 2085 return LT.first; 2086 } 2087 2088 static const TypeConversionCostTblEntry 2089 VectorSelectTbl[] = { 2090 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, 2091 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, 2092 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, 2093 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, 2094 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, 2095 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } 2096 }; 2097 2098 EVT SelCondTy = TLI->getValueType(DL, CondTy); 2099 EVT SelValTy = TLI->getValueType(DL, ValTy); 2100 if (SelCondTy.isSimple() && SelValTy.isSimple()) { 2101 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, 2102 SelCondTy.getSimpleVT(), 2103 SelValTy.getSimpleVT())) 2104 return Entry->Cost; 2105 } 2106 } 2107 // The base case handles scalable vectors fine for now, since it treats the 2108 // cost as 1 * 
legalization cost. 2109 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); 2110 } 2111 2112 AArch64TTIImpl::TTI::MemCmpExpansionOptions 2113 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { 2114 TTI::MemCmpExpansionOptions Options; 2115 if (ST->requiresStrictAlign()) { 2116 // TODO: Add cost modeling for strict align. Misaligned loads expand to 2117 // a bunch of instructions when strict align is enabled. 2118 return Options; 2119 } 2120 Options.AllowOverlappingLoads = true; 2121 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); 2122 Options.NumLoadsPerBlock = Options.MaxNumLoads; 2123 // TODO: Though vector loads usually perform well on AArch64, in some targets 2124 // they may wake up the FP unit, which raises the power consumption. Perhaps 2125 // they could be used with no holds barred (-O3). 2126 Options.LoadSizes = {8, 4, 2, 1}; 2127 return Options; 2128 } 2129 2130 bool AArch64TTIImpl::prefersVectorizedAddressing() const { 2131 return ST->hasSVE(); 2132 } 2133 2134 InstructionCost 2135 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, 2136 Align Alignment, unsigned AddressSpace, 2137 TTI::TargetCostKind CostKind) { 2138 if (useNeonVector(Src)) 2139 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 2140 CostKind); 2141 auto LT = TLI->getTypeLegalizationCost(DL, Src); 2142 if (!LT.first.isValid()) 2143 return InstructionCost::getInvalid(); 2144 2145 // The code-generator is currently not able to handle scalable vectors 2146 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2147 // it. This change will be removed when code-generation for these types is 2148 // sufficiently reliable. 
2149 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1)) 2150 return InstructionCost::getInvalid(); 2151 2152 return LT.first * 2; 2153 } 2154 2155 static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { 2156 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead; 2157 } 2158 2159 InstructionCost AArch64TTIImpl::getGatherScatterOpCost( 2160 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 2161 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 2162 if (useNeonVector(DataTy)) 2163 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 2164 Alignment, CostKind, I); 2165 auto *VT = cast<VectorType>(DataTy); 2166 auto LT = TLI->getTypeLegalizationCost(DL, DataTy); 2167 if (!LT.first.isValid()) 2168 return InstructionCost::getInvalid(); 2169 2170 // The code-generator is currently not able to handle scalable vectors 2171 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2172 // it. This change will be removed when code-generation for these types is 2173 // sufficiently reliable. 2174 if (cast<VectorType>(DataTy)->getElementCount() == 2175 ElementCount::getScalable(1)) 2176 return InstructionCost::getInvalid(); 2177 2178 ElementCount LegalVF = LT.second.getVectorElementCount(); 2179 InstructionCost MemOpCost = 2180 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); 2181 // Add on an overhead cost for using gathers/scatters. 2182 // TODO: At the moment this is applied unilaterally for all CPUs, but at some 2183 // point we may want a per-CPU overhead. 
2184 MemOpCost *= getSVEGatherScatterOverhead(Opcode); 2185 return LT.first * MemOpCost * getMaxNumElements(LegalVF); 2186 } 2187 2188 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { 2189 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); 2190 } 2191 2192 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, 2193 MaybeAlign Alignment, 2194 unsigned AddressSpace, 2195 TTI::TargetCostKind CostKind, 2196 const Instruction *I) { 2197 EVT VT = TLI->getValueType(DL, Ty, true); 2198 // Type legalization can't handle structs 2199 if (VT == MVT::Other) 2200 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, 2201 CostKind); 2202 2203 auto LT = TLI->getTypeLegalizationCost(DL, Ty); 2204 if (!LT.first.isValid()) 2205 return InstructionCost::getInvalid(); 2206 2207 // The code-generator is currently not able to handle scalable vectors 2208 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2209 // it. This change will be removed when code-generation for these types is 2210 // sufficiently reliable. 2211 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) 2212 if (VTy->getElementCount() == ElementCount::getScalable(1)) 2213 return InstructionCost::getInvalid(); 2214 2215 // TODO: consider latency as well for TCK_SizeAndLatency. 2216 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) 2217 return LT.first; 2218 2219 if (CostKind != TTI::TCK_RecipThroughput) 2220 return 1; 2221 2222 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && 2223 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) { 2224 // Unaligned stores are extremely inefficient. We don't split all 2225 // unaligned 128-bit stores because the negative impact that has shown in 2226 // practice on inlined block copy code. 2227 // We make such stores expensive so that we will only vectorize if there 2228 // are 6 other instructions getting vectorized. 
2229 const int AmortizationCost = 6; 2230 2231 return LT.first * 2 * AmortizationCost; 2232 } 2233 2234 // Check truncating stores and extending loads. 2235 if (useNeonVector(Ty) && 2236 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { 2237 // v4i8 types are lowered to scalar a load/store and sshll/xtn. 2238 if (VT == MVT::v4i8) 2239 return 2; 2240 // Otherwise we need to scalarize. 2241 return cast<FixedVectorType>(Ty)->getNumElements() * 2; 2242 } 2243 2244 return LT.first; 2245 } 2246 2247 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( 2248 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 2249 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 2250 bool UseMaskForCond, bool UseMaskForGaps) { 2251 assert(Factor >= 2 && "Invalid interleave factor"); 2252 auto *VecVTy = cast<FixedVectorType>(VecTy); 2253 2254 if (!UseMaskForCond && !UseMaskForGaps && 2255 Factor <= TLI->getMaxSupportedInterleaveFactor()) { 2256 unsigned NumElts = VecVTy->getNumElements(); 2257 auto *SubVecTy = 2258 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); 2259 2260 // ldN/stN only support legal vector types of size 64 or 128 in bits. 2261 // Accesses having vector types that are a multiple of 128 bits can be 2262 // matched to more than one ldN/stN instruction. 
2263 bool UseScalable; 2264 if (NumElts % Factor == 0 && 2265 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) 2266 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); 2267 } 2268 2269 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 2270 Alignment, AddressSpace, CostKind, 2271 UseMaskForCond, UseMaskForGaps); 2272 } 2273 2274 InstructionCost 2275 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { 2276 InstructionCost Cost = 0; 2277 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2278 for (auto *I : Tys) { 2279 if (!I->isVectorTy()) 2280 continue; 2281 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == 2282 128) 2283 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + 2284 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); 2285 } 2286 return Cost; 2287 } 2288 2289 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { 2290 return ST->getMaxInterleaveFactor(); 2291 } 2292 2293 // For Falkor, we want to avoid having too many strided loads in a loop since 2294 // that can exhaust the HW prefetcher resources. We adjust the unroller 2295 // MaxCount preference below to attempt to ensure unrolling doesn't create too 2296 // many strided loads. 2297 static void 2298 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2299 TargetTransformInfo::UnrollingPreferences &UP) { 2300 enum { MaxStridedLoads = 7 }; 2301 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { 2302 int StridedLoads = 0; 2303 // FIXME? We could make this more precise by looking at the CFG and 2304 // e.g. not counting loads in each side of an if-then-else diamond. 
2305 for (const auto BB : L->blocks()) { 2306 for (auto &I : *BB) { 2307 LoadInst *LMemI = dyn_cast<LoadInst>(&I); 2308 if (!LMemI) 2309 continue; 2310 2311 Value *PtrValue = LMemI->getPointerOperand(); 2312 if (L->isLoopInvariant(PtrValue)) 2313 continue; 2314 2315 const SCEV *LSCEV = SE.getSCEV(PtrValue); 2316 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); 2317 if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) 2318 continue; 2319 2320 // FIXME? We could take pairing of unrolled load copies into account 2321 // by looking at the AddRec, but we would probably have to limit this 2322 // to loops with no stores or other memory optimization barriers. 2323 ++StridedLoads; 2324 // We've seen enough strided loads that seeing more won't make a 2325 // difference. 2326 if (StridedLoads > MaxStridedLoads / 2) 2327 return StridedLoads; 2328 } 2329 } 2330 return StridedLoads; 2331 }; 2332 2333 int StridedLoads = countStridedLoads(L, SE); 2334 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads 2335 << " strided loads\n"); 2336 // Pick the largest power of 2 unroll count that won't result in too many 2337 // strided loads. 2338 if (StridedLoads) { 2339 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); 2340 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " 2341 << UP.MaxCount << '\n'); 2342 } 2343 } 2344 2345 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2346 TTI::UnrollingPreferences &UP, 2347 OptimizationRemarkEmitter *ORE) { 2348 // Enable partial unrolling and runtime unrolling. 2349 BaseT::getUnrollingPreferences(L, SE, UP, ORE); 2350 2351 UP.UpperBound = true; 2352 2353 // For inner loop, it is more likely to be a hot one, and the runtime check 2354 // can be promoted out from LICM pass, so the overhead is less, let's try 2355 // a larger threshold to unroll more loops. 2356 if (L->getLoopDepth() > 1) 2357 UP.PartialThreshold *= 2; 2358 2359 // Disable partial & runtime unrolling on -Os. 
2360 UP.PartialOptSizeThreshold = 0; 2361 2362 if (ST->getProcFamily() == AArch64Subtarget::Falkor && 2363 EnableFalkorHWPFUnrollFix) 2364 getFalkorUnrollingPreferences(L, SE, UP); 2365 2366 // Scan the loop: don't unroll loops with calls as this could prevent 2367 // inlining. Don't unroll vector loops either, as they don't benefit much from 2368 // unrolling. 2369 for (auto *BB : L->getBlocks()) { 2370 for (auto &I : *BB) { 2371 // Don't unroll vectorised loop. 2372 if (I.getType()->isVectorTy()) 2373 return; 2374 2375 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 2376 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 2377 if (!isLoweredToCall(F)) 2378 continue; 2379 } 2380 return; 2381 } 2382 } 2383 } 2384 2385 // Enable runtime unrolling for in-order models 2386 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by 2387 // checking for that case, we can ensure that the default behaviour is 2388 // unchanged 2389 if (ST->getProcFamily() != AArch64Subtarget::Others && 2390 !ST->getSchedModel().isOutOfOrder()) { 2391 UP.Runtime = true; 2392 UP.Partial = true; 2393 UP.UnrollRemainder = true; 2394 UP.DefaultUnrollRuntimeCount = 4; 2395 2396 UP.UnrollAndJam = true; 2397 UP.UnrollAndJamInnerLoopThreshold = 60; 2398 } 2399 } 2400 2401 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 2402 TTI::PeelingPreferences &PP) { 2403 BaseT::getPeelingPreferences(L, SE, PP); 2404 } 2405 2406 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, 2407 Type *ExpectedType) { 2408 switch (Inst->getIntrinsicID()) { 2409 default: 2410 return nullptr; 2411 case Intrinsic::aarch64_neon_st2: 2412 case Intrinsic::aarch64_neon_st3: 2413 case Intrinsic::aarch64_neon_st4: { 2414 // Create a struct type 2415 StructType *ST = dyn_cast<StructType>(ExpectedType); 2416 if (!ST) 2417 return nullptr; 2418 unsigned NumElts = Inst->arg_size() - 1; 2419 if (ST->getNumElements() != NumElts) 2420 return nullptr; 2421 
for (unsigned i = 0, e = NumElts; i != e; ++i) { 2422 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) 2423 return nullptr; 2424 } 2425 Value *Res = UndefValue::get(ExpectedType); 2426 IRBuilder<> Builder(Inst); 2427 for (unsigned i = 0, e = NumElts; i != e; ++i) { 2428 Value *L = Inst->getArgOperand(i); 2429 Res = Builder.CreateInsertValue(Res, L, i); 2430 } 2431 return Res; 2432 } 2433 case Intrinsic::aarch64_neon_ld2: 2434 case Intrinsic::aarch64_neon_ld3: 2435 case Intrinsic::aarch64_neon_ld4: 2436 if (Inst->getType() == ExpectedType) 2437 return Inst; 2438 return nullptr; 2439 } 2440 } 2441 2442 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, 2443 MemIntrinsicInfo &Info) { 2444 switch (Inst->getIntrinsicID()) { 2445 default: 2446 break; 2447 case Intrinsic::aarch64_neon_ld2: 2448 case Intrinsic::aarch64_neon_ld3: 2449 case Intrinsic::aarch64_neon_ld4: 2450 Info.ReadMem = true; 2451 Info.WriteMem = false; 2452 Info.PtrVal = Inst->getArgOperand(0); 2453 break; 2454 case Intrinsic::aarch64_neon_st2: 2455 case Intrinsic::aarch64_neon_st3: 2456 case Intrinsic::aarch64_neon_st4: 2457 Info.ReadMem = false; 2458 Info.WriteMem = true; 2459 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1); 2460 break; 2461 } 2462 2463 switch (Inst->getIntrinsicID()) { 2464 default: 2465 return false; 2466 case Intrinsic::aarch64_neon_ld2: 2467 case Intrinsic::aarch64_neon_st2: 2468 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; 2469 break; 2470 case Intrinsic::aarch64_neon_ld3: 2471 case Intrinsic::aarch64_neon_st3: 2472 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; 2473 break; 2474 case Intrinsic::aarch64_neon_ld4: 2475 case Intrinsic::aarch64_neon_st4: 2476 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; 2477 break; 2478 } 2479 return true; 2480 } 2481 2482 /// See if \p I should be considered for address type promotion. We check if \p 2483 /// I is a sext with right type and used in memory accesses. 
/// If it used in a
/// "complex" getelementptr, we allow it to be promoted without finding other
/// sext instructions that sign extended the same initial value. A getelementptr
/// is considered as "complex" if it has more than 2 operands.
bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
  bool Considerable = false;
  AllowPromotionWithoutCommonHeader = false;
  // Only sign-extensions to i64 are of interest here.
  if (!isa<SExtInst>(&I))
    return false;
  Type *ConsideredSExtType =
      Type::getInt64Ty(I.getParent()->getParent()->getContext());
  if (I.getType() != ConsideredSExtType)
    return false;
  // See if the sext is the one with the right type and used in at least one
  // GetElementPtrInst.
  for (const User *U : I.users()) {
    if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
      Considerable = true;
      // A getelementptr is considered as "complex" if it has more than 2
      // operands. We will promote a SExt used in such complex GEP as we
      // expect some computation to be merged if they are done on 64 bits.
      if (GEPInst->getNumOperands() > 2) {
        AllowPromotionWithoutCommonHeader = true;
        break;
      }
    }
  }
  return Considerable;
}

/// Return true if the reduction described by \p RdxDesc can be vectorized
/// with vectorization factor \p VF. Fixed-width VFs are always allowed;
/// scalable VFs are restricted to element types and recurrence kinds that
/// can be lowered for SVE.
bool AArch64TTIImpl::isLegalToVectorizeReduction(
    const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
  if (!VF.isScalable())
    return true;

  Type *Ty = RdxDesc.getRecurrenceType();
  // bfloat and element types illegal for scalable vectors are rejected.
  if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
    return false;

  switch (RdxDesc.getRecurrenceKind()) {
  case RecurKind::Add:
  case RecurKind::FAdd:
  case RecurKind::And:
  case RecurKind::Or:
  case RecurKind::Xor:
  case RecurKind::SMin:
  case RecurKind::SMax:
  case RecurKind::UMin:
  case RecurKind::UMax:
  case RecurKind::FMin:
  case RecurKind::FMax:
  case RecurKind::SelectICmp:
  case RecurKind::SelectFCmp:
  case RecurKind::FMulAdd:
    return true;
  default:
    return false;
  }
}

/// Cost of a min/max reduction over \p Ty: one pairwise min/max intrinsic per
/// extra legalized part when the type must be split, plus a flat cost of 2
/// for the final horizontal reduction.
InstructionCost
AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                       bool IsUnsigned,
                                       TTI::TargetCostKind CostKind) {
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  // f16 without full FP16 support is promoted; let the generic code cost it.
  if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);

  assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
         "Both vector needs to be equally scalable");

  InstructionCost LegalizationCost = 0;
  if (LT.first > 1) {
    Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
    // NOTE(review): only signedness/FP-ness is distinguished here — the min
    // intrinsics (and maxnum for FP) stand in for both min and max reductions,
    // presumably because the min/max variants cost the same; confirm if a
    // direction-sensitive cost is ever needed.
    unsigned MinMaxOpcode =
        Ty->isFPOrFPVectorTy()
            ? Intrinsic::maxnum
            : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
    IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy});
    LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
  }

  return LegalizationCost + /*Cost of horizontal reduction*/ 2;
}

/// Cost of an unordered arithmetic reduction of a scalable vector: one vector
/// op per extra legalized part, plus a flat cost of 2 for the final legal
/// horizontal reduction. Only add/and/or/xor/fadd are supported; everything
/// else is reported as invalid (not vectorizable this way).
InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
    unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
  InstructionCost LegalizationCost = 0;
  if (LT.first > 1) {
    Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
    LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
    LegalizationCost *= LT.first - 1;
  }

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");
  // Add the final reduction cost for the legal horizontal reduction
  switch (ISD) {
  case ISD::ADD:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::FADD:
    return LegalizationCost + 2;
  default:
    return InstructionCost::getInvalid();
  }
}

/// Cost of an arithmetic reduction with \p Opcode over \p ValTy. Ordered
/// (strict FP) reductions get an elementwise cost; unordered scalable
/// reductions are delegated to getArithmeticReductionCostSVE; fixed-width
/// add/or/xor/and use the NEON cost table below; anything else falls back to
/// the generic implementation.
InstructionCost
AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                           Optional<FastMathFlags> FMF,
                                           TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF)) {
    if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
      InstructionCost BaseCost =
          BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
      // Add on extra cost to reflect the extra overhead on some CPUs. We still
      // end up vectorizing for more computationally intensive loops.
      return BaseCost + FixedVTy->getNumElements();
    }

    // Ordered scalable reductions are only handled for fadd.
    if (Opcode != Instruction::FAdd)
      return InstructionCost::getInvalid();

    // An ordered scalable reduction is serialized: cost one scalar op per
    // (maximum possible) element.
    auto *VTy = cast<ScalableVectorType>(ValTy);
    InstructionCost Cost =
        getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
    Cost *= getMaxNumElements(VTy->getElementCount());
    return Cost;
  }

  if (isa<ScalableVectorType>(ValTy))
    return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
  MVT MTy = LT.second;
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Horizontal adds can use the 'addv' instruction. We model the cost of these
  // instructions as twice a normal vector add, plus 1 for each legalization
  // step (LT.first). This is the only arithmetic vector reduction operation for
  // which we have an instruction.
  // OR, XOR and AND costs should match the codegen from:
  // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
  // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
  // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
  static const CostTblEntry CostTblNoPairwise[]{
      {ISD::ADD, MVT::v8i8, 2},
      {ISD::ADD, MVT::v16i8, 2},
      {ISD::ADD, MVT::v4i16, 2},
      {ISD::ADD, MVT::v8i16, 2},
      {ISD::ADD, MVT::v4i32, 2},
      {ISD::OR, MVT::v8i8, 15},
      {ISD::OR, MVT::v16i8, 17},
      {ISD::OR, MVT::v4i16, 7},
      {ISD::OR, MVT::v8i16, 9},
      {ISD::OR, MVT::v2i32, 3},
      {ISD::OR, MVT::v4i32, 5},
      {ISD::OR, MVT::v2i64, 3},
      {ISD::XOR, MVT::v8i8, 15},
      {ISD::XOR, MVT::v16i8, 17},
      {ISD::XOR, MVT::v4i16, 7},
      {ISD::XOR, MVT::v8i16, 9},
      {ISD::XOR, MVT::v2i32, 3},
      {ISD::XOR, MVT::v4i32, 5},
      {ISD::XOR, MVT::v2i64, 3},
      {ISD::AND, MVT::v8i8, 15},
      {ISD::AND, MVT::v16i8, 17},
      {ISD::AND, MVT::v4i16, 7},
      {ISD::AND, MVT::v8i16, 9},
      {ISD::AND, MVT::v2i32, 3},
      {ISD::AND, MVT::v4i32, 5},
      {ISD::AND, MVT::v2i64, 3},
  };
  switch (ISD) {
  default:
    break;
  case ISD::ADD:
    if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
      // Each extra legalization step adds one vector add.
      return (LT.first - 1) + Entry->Cost;
    break;
  case ISD::XOR:
  case ISD::AND:
  case ISD::OR:
    const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
    if (!Entry)
      break;
    auto *ValVTy = cast<FixedVectorType>(ValTy);
    // The table cost is only valid for non-i1 elements with a power-of-two
    // element count that legalizes without widening.
    if (!ValVTy->getElementType()->isIntegerTy(1) &&
        MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
        isPowerOf2_32(ValVTy->getNumElements())) {
      InstructionCost ExtraCost = 0;
      if (LT.first != 1) {
        // Type needs to be split, so there is an extra cost of LT.first - 1
        // arithmetic ops.
        auto *Ty = FixedVectorType::get(ValTy->getElementType(),
                                        MTy.getVectorNumElements());
        ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
        ExtraCost *= LT.first - 1;
      }
      return Entry->Cost + ExtraCost;
    }
    break;
  }
  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}

/// Cost of a vector.splice of scalable type \p Tp starting at \p Index.
/// Accounts for predicate promotion (i1 types), the extra compare/select
/// needed for a trailing-element (negative) index, and legalization splits.
InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
  // All legal scalable types splice in a single instruction.
  static const CostTblEntry ShuffleTbl[] = {
      { TTI::SK_Splice, MVT::nxv16i8,  1 },
      { TTI::SK_Splice, MVT::nxv8i16,  1 },
      { TTI::SK_Splice, MVT::nxv4i32,  1 },
      { TTI::SK_Splice, MVT::nxv2i64,  1 },
      { TTI::SK_Splice, MVT::nxv2f16,  1 },
      { TTI::SK_Splice, MVT::nxv4f16,  1 },
      { TTI::SK_Splice, MVT::nxv8f16,  1 },
      { TTI::SK_Splice, MVT::nxv2bf16, 1 },
      { TTI::SK_Splice, MVT::nxv4bf16, 1 },
      { TTI::SK_Splice, MVT::nxv8bf16, 1 },
      { TTI::SK_Splice, MVT::nxv2f32,  1 },
      { TTI::SK_Splice, MVT::nxv4f32,  1 },
      { TTI::SK_Splice, MVT::nxv2f64,  1 },
  };

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
  Type *LegalVTy =
      EVT(LT.second).getTypeForEVT(Tp->getContext());
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  // i1 splices are costed on the promoted predicate type (see below).
  EVT PromotedVT = LT.second.getScalarType() == MVT::i1
                       ? TLI->getPromotedVTForPredicate(EVT(LT.second))
                       : LT.second;
  Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
  InstructionCost LegalizationCost = 0;
  // A negative index selects trailing elements, which additionally needs a
  // compare and a select to form the mask.
  if (Index < 0) {
    LegalizationCost =
        getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
                           CmpInst::BAD_ICMP_PREDICATE, CostKind) +
        getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
                           CmpInst::BAD_ICMP_PREDICATE, CostKind);
  }

  // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
  // Cost performed on a promoted type.
  if (LT.second.getScalarType() == MVT::i1) {
    LegalizationCost +=
        getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
                         TTI::CastContextHint::None, CostKind) +
        getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
                         TTI::CastContextHint::None, CostKind);
  }
  const auto *Entry =
      CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
  assert(Entry && "Illegal Type for Splice");
  LegalizationCost += Entry->Cost;
  return LegalizationCost * LT.first;
}

/// Cost of a shuffle of kind \p Kind on \p Tp with (optional) mask \p Mask.
/// Oversized fixed-width masks are split per legalized part and re-costed;
/// otherwise known-cheap patterns (ld1r broadcasts, 4-element perfect
/// shuffles, table-listed kinds, splices, aligned subvector inserts) are
/// costed directly, with a generic fallback for everything else.
InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                               VectorType *Tp,
                                               ArrayRef<int> Mask, int Index,
                                               VectorType *SubTp,
                                               ArrayRef<const Value *> Args) {
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
  // If we have a Mask, and the LT is being legalized somehow, split the Mask
  // into smaller vectors and sum the cost of each shuffle.
  if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
      Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
      cast<FixedVectorType>(Tp)->getNumElements() >
          LT.second.getVectorNumElements() &&
      !Index && !SubTp) {
    unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements();
    assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!");
    unsigned LTNumElts = LT.second.getVectorNumElements();
    // Number of legal-sized destination sub-vectors (rounded up).
    unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
    VectorType *NTp =
        VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
    InstructionCost Cost;
    for (unsigned N = 0; N < NumVecs; N++) {
      SmallVector<int> NMask;
      // Split the existing mask into chunks of size LTNumElts. Track the source
      // sub-vectors to ensure the result has at most 2 inputs.
      unsigned Source1, Source2;
      unsigned NumSources = 0;
      for (unsigned E = 0; E < LTNumElts; E++) {
        // Elements past the end of the original mask are treated as undef.
        int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
                                                      : UndefMaskElem;
        if (MaskElt < 0) {
          NMask.push_back(UndefMaskElem);
          continue;
        }

        // Calculate which source from the input this comes from and whether it
        // is new to us.
        unsigned Source = MaskElt / LTNumElts;
        if (NumSources == 0) {
          Source1 = Source;
          NumSources = 1;
        } else if (NumSources == 1 && Source != Source1) {
          Source2 = Source;
          NumSources = 2;
        } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
          NumSources++;
        }

        // Add to the new mask. For the NumSources>2 case these are not correct,
        // but are only used for the modular lane number.
        if (Source == Source1)
          NMask.push_back(MaskElt % LTNumElts);
        else if (Source == Source2)
          NMask.push_back(MaskElt % LTNumElts + LTNumElts);
        else
          NMask.push_back(MaskElt % LTNumElts);
      }
      // If the sub-mask has at most 2 input sub-vectors then re-cost it using
      // getShuffleCost. If not then cost it using the worst case.
      if (NumSources <= 2)
        Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
                                               : TTI::SK_PermuteTwoSrc,
                               NTp, NMask, 0, nullptr, Args);
      else if (any_of(enumerate(NMask), [&](const auto &ME) {
                 return ME.value() % LTNumElts == ME.index();
               }))
        // At least one lane stays in place, so one insert can be saved.
        Cost += LTNumElts - 1;
      else
        Cost += LTNumElts;
    }
    return Cost;
  }

  // The mask may reveal a cheaper kind than the caller requested.
  Kind = improveShuffleKindFromMask(Kind, Mask);

  // Check for broadcast loads.
  if (Kind == TTI::SK_Broadcast) {
    bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
    if (IsLoad && LT.second.isVector() &&
        isLegalBroadcastLoad(Tp->getElementType(),
                             LT.second.getVectorElementCount()))
      return 0; // broadcast is handled by ld1r
  }

  // If we have 4 elements for the shuffle and a Mask, get the cost straight
  // from the perfect shuffle tables.
  if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
      (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
      all_of(Mask, [](int E) { return E < 8; }))
    return getPerfectShuffleCost(Mask);

  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
      Kind == TTI::SK_Reverse) {

    static const CostTblEntry ShuffleTbl[] = {
        // Broadcast shuffle kinds can be performed with 'dup'.
        { TTI::SK_Broadcast, MVT::v8i8,  1 },
        { TTI::SK_Broadcast, MVT::v16i8, 1 },
        { TTI::SK_Broadcast, MVT::v4i16, 1 },
        { TTI::SK_Broadcast, MVT::v8i16, 1 },
        { TTI::SK_Broadcast, MVT::v2i32, 1 },
        { TTI::SK_Broadcast, MVT::v4i32, 1 },
        { TTI::SK_Broadcast, MVT::v2i64, 1 },
        { TTI::SK_Broadcast, MVT::v2f32, 1 },
        { TTI::SK_Broadcast, MVT::v4f32, 1 },
        { TTI::SK_Broadcast, MVT::v2f64, 1 },
        // Transpose shuffle kinds can be performed with 'trn1/trn2' and
        // 'zip1/zip2' instructions.
        { TTI::SK_Transpose, MVT::v8i8,  1 },
        { TTI::SK_Transpose, MVT::v16i8, 1 },
        { TTI::SK_Transpose, MVT::v4i16, 1 },
        { TTI::SK_Transpose, MVT::v8i16, 1 },
        { TTI::SK_Transpose, MVT::v2i32, 1 },
        { TTI::SK_Transpose, MVT::v4i32, 1 },
        { TTI::SK_Transpose, MVT::v2i64, 1 },
        { TTI::SK_Transpose, MVT::v2f32, 1 },
        { TTI::SK_Transpose, MVT::v4f32, 1 },
        { TTI::SK_Transpose, MVT::v2f64, 1 },
        // Select shuffle kinds.
        // TODO: handle vXi8/vXi16.
        { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
        { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
        { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
        { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
        { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
        { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
        // PermuteSingleSrc shuffle kinds.
        { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
        { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
        { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
        { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
        { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
        { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
        { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case.
        { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case.
        { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case.
        { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl
        { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl
        { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl
        { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl
        { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl
        // Reverse can be lowered with `rev`.
        { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov.
        { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT
        { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov.
        { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov.
        { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT
        { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov.
        { TTI::SK_Reverse, MVT::v8f16, 2 }, // REV64; EXT
        { TTI::SK_Reverse, MVT::v8i16, 2 }, // REV64; EXT
        { TTI::SK_Reverse, MVT::v16i8, 2 }, // REV64; EXT
        { TTI::SK_Reverse, MVT::v4f16, 1 }, // REV64
        { TTI::SK_Reverse, MVT::v4i16, 1 }, // REV64
        { TTI::SK_Reverse, MVT::v8i8, 1 }, // REV64
        // Broadcast shuffle kinds for scalable vectors
        { TTI::SK_Broadcast, MVT::nxv16i8,  1 },
        { TTI::SK_Broadcast, MVT::nxv8i16,  1 },
        { TTI::SK_Broadcast, MVT::nxv4i32,  1 },
        { TTI::SK_Broadcast, MVT::nxv2i64,  1 },
        { TTI::SK_Broadcast, MVT::nxv2f16,  1 },
        { TTI::SK_Broadcast, MVT::nxv4f16,  1 },
        { TTI::SK_Broadcast, MVT::nxv8f16,  1 },
        { TTI::SK_Broadcast, MVT::nxv2bf16, 1 },
        { TTI::SK_Broadcast, MVT::nxv4bf16, 1 },
        { TTI::SK_Broadcast, MVT::nxv8bf16, 1 },
        { TTI::SK_Broadcast, MVT::nxv2f32,  1 },
        { TTI::SK_Broadcast, MVT::nxv4f32,  1 },
        { TTI::SK_Broadcast, MVT::nxv2f64,  1 },
        { TTI::SK_Broadcast, MVT::nxv16i1,  1 },
        { TTI::SK_Broadcast, MVT::nxv8i1,   1 },
        { TTI::SK_Broadcast, MVT::nxv4i1,   1 },
        { TTI::SK_Broadcast, MVT::nxv2i1,   1 },
        // Handle the cases for vector.reverse with scalable vectors
        { TTI::SK_Reverse, MVT::nxv16i8,  1 },
        { TTI::SK_Reverse, MVT::nxv8i16,  1 },
        { TTI::SK_Reverse, MVT::nxv4i32,  1 },
        { TTI::SK_Reverse, MVT::nxv2i64,  1 },
        { TTI::SK_Reverse, MVT::nxv2f16,  1 },
        { TTI::SK_Reverse, MVT::nxv4f16,  1 },
        { TTI::SK_Reverse, MVT::nxv8f16,  1 },
        { TTI::SK_Reverse, MVT::nxv2bf16, 1 },
        { TTI::SK_Reverse, MVT::nxv4bf16, 1 },
        { TTI::SK_Reverse, MVT::nxv8bf16, 1 },
        { TTI::SK_Reverse, MVT::nxv2f32,  1 },
        { TTI::SK_Reverse, MVT::nxv4f32,  1 },
        { TTI::SK_Reverse, MVT::nxv2f64,  1 },
        { TTI::SK_Reverse, MVT::nxv16i1,  1 },
        { TTI::SK_Reverse, MVT::nxv8i1,   1 },
        { TTI::SK_Reverse, MVT::nxv4i1,   1 },
        { TTI::SK_Reverse, MVT::nxv2i1,   1 },
    };
    if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
  }

  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
    return getSpliceCost(Tp, Index);

  // Inserting a subvector can often be done with either a D, S or H register
  // move, so long as the inserted vector is "aligned".
  if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
      LT.second.getSizeInBits() <= 128 && SubTp) {
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumElts = LT.second.getVectorNumElements();
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}