//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <algorithm>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                           cl::Hidden);

static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(10), cl::Hidden);

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
  return (CallerBits & CalleeBits) == CalleeBits;
}

bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
    TargetTransformInfo::RegisterKind K) const {
  assert(K != TargetTransformInfo::RGK_Scalar);
  return K == TargetTransformInfo::RGK_FixedWidthVector;
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}

/// Calculate the cost of materializing the given constant.
InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
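  // Illustrative example (not from the original source): an i128 immediate is
  // sign-extended to a multiple of 64 bits and then costed as two 64-bit
  // chunks, each fed through getIntImmCost(int64_t) independently; the loop
  // below sums those per-chunk costs, with a minimum total cost of 1.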
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}

InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
                                        MVT::v8i16, MVT::v2i32, MVT::v4i32};
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    // v2i64 types get converted to cmp+bif hence the cost of 2
    if (LT.second == MVT::v2i64)
      return LT.first * 2;
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
    break;
  }
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::experimental_stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
    // (LT.first - 1) vector adds.
    if (LT.first > 1) {
      Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
      InstructionCost AddCost =
          getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
      Cost += AddCost * (LT.first - 1);
    }
    return Cost;
  }
  case Intrinsic::bitreverse: {
    static const CostTblEntry BitreverseTbl[] = {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},
    };
    const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy);
    const auto *Entry =
        CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
    if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 are promoted
      // to, plus 1, so that we match the actual lowering cost.
      if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
          TLI->getValueType(DL, RetTy, true) == MVT::i16)
        return LegalisationCost.first * Entry->Cost + 1;

      return LegalisationCost.first * Entry->Cost;
    }
    break;
  }
  case Intrinsic::ctpop: {
    static const CostTblEntry CtpopCostTbl[] = {
        {ISD::CTPOP, MVT::v2i64, 4},
        {ISD::CTPOP, MVT::v4i32, 3},
        {ISD::CTPOP, MVT::v8i16, 2},
        {ISD::CTPOP, MVT::v16i8, 1},
        {ISD::CTPOP, MVT::i64, 4},
        {ISD::CTPOP, MVT::v2i32, 3},
        {ISD::CTPOP, MVT::v4i16, 2},
        {ISD::CTPOP, MVT::v8i8, 1},
        {ISD::CTPOP, MVT::i32, 5},
    };
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    MVT MTy = LT.second;
    if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
      // Extra cost of +1 when illegal vector types are legalized by promoting
      // the integer type.
      int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
                                            RetTy->getScalarSizeInBits()
                          ? 1
                          : 0;
      return LT.first * Entry->Cost + ExtraCost;
    }
    break;
  }
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: {
    static const CostTblEntry WithOverflowCostTbl[] = {
        {Intrinsic::sadd_with_overflow, MVT::i8, 3},
        {Intrinsic::uadd_with_overflow, MVT::i8, 3},
        {Intrinsic::sadd_with_overflow, MVT::i16, 3},
        {Intrinsic::uadd_with_overflow, MVT::i16, 3},
        {Intrinsic::sadd_with_overflow, MVT::i32, 1},
        {Intrinsic::uadd_with_overflow, MVT::i32, 1},
        {Intrinsic::sadd_with_overflow, MVT::i64, 1},
        {Intrinsic::uadd_with_overflow, MVT::i64, 1},
        {Intrinsic::ssub_with_overflow, MVT::i8, 3},
        {Intrinsic::usub_with_overflow, MVT::i8, 3},
        {Intrinsic::ssub_with_overflow, MVT::i16, 3},
        {Intrinsic::usub_with_overflow, MVT::i16, 3},
        {Intrinsic::ssub_with_overflow, MVT::i32, 1},
        {Intrinsic::usub_with_overflow, MVT::i32, 1},
        {Intrinsic::ssub_with_overflow, MVT::i64, 1},
        {Intrinsic::usub_with_overflow, MVT::i64, 1},
        {Intrinsic::smul_with_overflow, MVT::i8, 5},
        {Intrinsic::umul_with_overflow, MVT::i8, 4},
        {Intrinsic::smul_with_overflow, MVT::i16, 5},
        {Intrinsic::umul_with_overflow, MVT::i16, 4},
        {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
        {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
        {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
        {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
    };
    EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
    if (MTy.isSimple())
      if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
                                              MTy.getSimpleVT()))
        return Entry->Cost;
    break;
  }
  default:
    break;
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

/// Remove redundant reinterpret casts in the presence of control flow.
static Optional<Instruction *> processPhiNode(InstCombiner &IC,
                                              IntrinsicInst &II) {
  SmallVector<Instruction *, 32> Worklist;
  auto RequiredType = II.getType();

  auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
  assert(PN && "Expected Phi Node!");

  // Don't create a new Phi unless we can remove the old one.
396 if (!PN->hasOneUse()) 397 return None; 398 399 for (Value *IncValPhi : PN->incoming_values()) { 400 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi); 401 if (!Reinterpret || 402 Reinterpret->getIntrinsicID() != 403 Intrinsic::aarch64_sve_convert_to_svbool || 404 RequiredType != Reinterpret->getArgOperand(0)->getType()) 405 return None; 406 } 407 408 // Create the new Phi 409 LLVMContext &Ctx = PN->getContext(); 410 IRBuilder<> Builder(Ctx); 411 Builder.SetInsertPoint(PN); 412 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); 413 Worklist.push_back(PN); 414 415 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { 416 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); 417 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); 418 Worklist.push_back(Reinterpret); 419 } 420 421 // Cleanup Phi Node and reinterprets 422 return IC.replaceInstUsesWith(II, NPN); 423 } 424 425 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))) 426 // => (binop (pred) (from_svbool _) (from_svbool _)) 427 // 428 // The above transformation eliminates a `to_svbool` in the predicate 429 // operand of bitwise operation `binop` by narrowing the vector width of 430 // the operation. For example, it would convert a `<vscale x 16 x i1> 431 // and` into a `<vscale x 4 x i1> and`. This is profitable because 432 // to_svbool must zero the new lanes during widening, whereas 433 // from_svbool is free. 434 static Optional<Instruction *> tryCombineFromSVBoolBinOp(InstCombiner &IC, 435 IntrinsicInst &II) { 436 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0)); 437 if (!BinOp) 438 return None; 439 440 auto IntrinsicID = BinOp->getIntrinsicID(); 441 switch (IntrinsicID) { 442 case Intrinsic::aarch64_sve_and_z: 443 case Intrinsic::aarch64_sve_bic_z: 444 case Intrinsic::aarch64_sve_eor_z: 445 case Intrinsic::aarch64_sve_nand_z: 446 case Intrinsic::aarch64_sve_nor_z: 447 case Intrinsic::aarch64_sve_orn_z: 448 case Intrinsic::aarch64_sve_orr_z: 449 break; 450 default: 451 return None; 452 } 453 454 auto BinOpPred = BinOp->getOperand(0); 455 auto BinOpOp1 = BinOp->getOperand(1); 456 auto BinOpOp2 = BinOp->getOperand(2); 457 458 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred); 459 if (!PredIntr || 460 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) 461 return None; 462 463 auto PredOp = PredIntr->getOperand(0); 464 auto PredOpTy = cast<VectorType>(PredOp->getType()); 465 if (PredOpTy != II.getType()) 466 return None; 467 468 IRBuilder<> Builder(II.getContext()); 469 Builder.SetInsertPoint(&II); 470 471 SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; 472 auto NarrowBinOpOp1 = Builder.CreateIntrinsic( 473 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1}); 474 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 475 if (BinOpOp1 == BinOpOp2) 476 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 477 else 478 NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic( 479 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2})); 480 481 auto NarrowedBinOp = 482 Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs); 483 return IC.replaceInstUsesWith(II, NarrowedBinOp); 484 } 485 486 static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC, 487 IntrinsicInst &II) { 488 // If the reinterpret instruction operand is a PHI Node 489 if (isa<PHINode>(II.getArgOperand(0))) 490 return processPhiNode(IC, II); 491 492 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) 493 return 
BinOpCombine; 494 495 SmallVector<Instruction *, 32> CandidatesForRemoval; 496 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; 497 498 const auto *IVTy = cast<VectorType>(II.getType()); 499 500 // Walk the chain of conversions. 501 while (Cursor) { 502 // If the type of the cursor has fewer lanes than the final result, zeroing 503 // must take place, which breaks the equivalence chain. 504 const auto *CursorVTy = cast<VectorType>(Cursor->getType()); 505 if (CursorVTy->getElementCount().getKnownMinValue() < 506 IVTy->getElementCount().getKnownMinValue()) 507 break; 508 509 // If the cursor has the same type as I, it is a viable replacement. 510 if (Cursor->getType() == IVTy) 511 EarliestReplacement = Cursor; 512 513 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); 514 515 // If this is not an SVE conversion intrinsic, this is the end of the chain. 516 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == 517 Intrinsic::aarch64_sve_convert_to_svbool || 518 IntrinsicCursor->getIntrinsicID() == 519 Intrinsic::aarch64_sve_convert_from_svbool)) 520 break; 521 522 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); 523 Cursor = IntrinsicCursor->getOperand(0); 524 } 525 526 // If no viable replacement in the conversion chain was found, there is 527 // nothing to do. 528 if (!EarliestReplacement) 529 return None; 530 531 return IC.replaceInstUsesWith(II, EarliestReplacement); 532 } 533 534 static Optional<Instruction *> instCombineSVESel(InstCombiner &IC, 535 IntrinsicInst &II) { 536 IRBuilder<> Builder(&II); 537 auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1), 538 II.getOperand(2)); 539 return IC.replaceInstUsesWith(II, Select); 540 } 541 542 static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC, 543 IntrinsicInst &II) { 544 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 545 if (!Pg) 546 return None; 547 548 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 549 return None; 550 551 const auto PTruePattern = 552 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 553 if (PTruePattern != AArch64SVEPredPattern::vl1) 554 return None; 555 556 // The intrinsic is inserting into lane zero so use an insert instead. 557 auto *IdxTy = Type::getInt64Ty(II.getContext()); 558 auto *Insert = InsertElementInst::Create( 559 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0)); 560 Insert->insertBefore(&II); 561 Insert->takeName(&II); 562 563 return IC.replaceInstUsesWith(II, Insert); 564 } 565 566 static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, 567 IntrinsicInst &II) { 568 // Replace DupX with a regular IR splat. 
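  // Illustrative sketch (operand names assumed, not from the original source):
  //   %r = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 %x)
  // becomes a regular splat, i.e. roughly an insertelement of %x into lane 0
  // followed by a zero-mask shufflevector:
  //   %ins = insertelement <vscale x 4 x i32> poison, i32 %x, i64 0
  //   %r   = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> poison,
  //                        <vscale x 4 x i32> zeroinitializer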
569 IRBuilder<> Builder(II.getContext()); 570 Builder.SetInsertPoint(&II); 571 auto *RetTy = cast<ScalableVectorType>(II.getType()); 572 Value *Splat = 573 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0)); 574 Splat->takeName(&II); 575 return IC.replaceInstUsesWith(II, Splat); 576 } 577 578 static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, 579 IntrinsicInst &II) { 580 LLVMContext &Ctx = II.getContext(); 581 IRBuilder<> Builder(Ctx); 582 Builder.SetInsertPoint(&II); 583 584 // Check that the predicate is all active 585 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 586 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 587 return None; 588 589 const auto PTruePattern = 590 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 591 if (PTruePattern != AArch64SVEPredPattern::all) 592 return None; 593 594 // Check that we have a compare of zero.. 595 auto *SplatValue = 596 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2))); 597 if (!SplatValue || !SplatValue->isZero()) 598 return None; 599 600 // ..against a dupq 601 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 602 if (!DupQLane || 603 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) 604 return None; 605 606 // Where the dupq is a lane 0 replicate of a vector insert 607 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero()) 608 return None; 609 610 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); 611 if (!VecIns || 612 VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert) 613 return None; 614 615 // Where the vector insert is a fixed constant vector insert into undef at 616 // index zero 617 if (!isa<UndefValue>(VecIns->getArgOperand(0))) 618 return None; 619 620 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero()) 621 return None; 622 623 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1)); 624 if (!ConstVec) 625 return None; 626 627 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType()); 628 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType()); 629 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) 630 return None; 631 632 unsigned NumElts = VecTy->getNumElements(); 633 unsigned PredicateBits = 0; 634 635 // Expand intrinsic operands to a 16-bit byte level predicate 636 for (unsigned I = 0; I < NumElts; ++I) { 637 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I)); 638 if (!Arg) 639 return None; 640 if (!Arg->isZero()) 641 PredicateBits |= 1 << (I * (16 / NumElts)); 642 } 643 644 // If all bits are zero bail early with an empty predicate 645 if (PredicateBits == 0) { 646 auto *PFalse = Constant::getNullValue(II.getType()); 647 PFalse->takeName(&II); 648 return IC.replaceInstUsesWith(II, PFalse); 649 } 650 651 // Calculate largest predicate type used (where byte predicate is largest) 652 unsigned Mask = 8; 653 for (unsigned I = 0; I < 16; ++I) 654 if ((PredicateBits & (1 << I)) != 0) 655 Mask |= (I % 8); 656 657 unsigned PredSize = Mask & -Mask; 658 auto *PredType = ScalableVectorType::get( 659 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8)); 660 661 // Ensure all relevant bits are set 662 for (unsigned I = 0; I < 16; I += PredSize) 663 if ((PredicateBits & (1 << I)) == 0) 664 return None; 665 666 auto *PTruePat = 667 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 668 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 669 {PredType}, {PTruePat}); 670 auto 
*ConvertToSVBool = Builder.CreateIntrinsic( 671 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); 672 auto *ConvertFromSVBool = 673 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, 674 {II.getType()}, {ConvertToSVBool}); 675 676 ConvertFromSVBool->takeName(&II); 677 return IC.replaceInstUsesWith(II, ConvertFromSVBool); 678 } 679 680 static Optional<Instruction *> instCombineSVELast(InstCombiner &IC, 681 IntrinsicInst &II) { 682 IRBuilder<> Builder(II.getContext()); 683 Builder.SetInsertPoint(&II); 684 Value *Pg = II.getArgOperand(0); 685 Value *Vec = II.getArgOperand(1); 686 auto IntrinsicID = II.getIntrinsicID(); 687 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; 688 689 // lastX(splat(X)) --> X 690 if (auto *SplatVal = getSplatValue(Vec)) 691 return IC.replaceInstUsesWith(II, SplatVal); 692 693 // If x and/or y is a splat value then: 694 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) 695 Value *LHS, *RHS; 696 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) { 697 if (isSplatValue(LHS) || isSplatValue(RHS)) { 698 auto *OldBinOp = cast<BinaryOperator>(Vec); 699 auto OpC = OldBinOp->getOpcode(); 700 auto *NewLHS = 701 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS}); 702 auto *NewRHS = 703 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS}); 704 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( 705 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II); 706 return IC.replaceInstUsesWith(II, NewBinOp); 707 } 708 } 709 710 auto *C = dyn_cast<Constant>(Pg); 711 if (IsAfter && C && C->isNullValue()) { 712 // The intrinsic is extracting lane 0 so use an extract instead. 713 auto *IdxTy = Type::getInt64Ty(II.getContext()); 714 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0)); 715 Extract->insertBefore(&II); 716 Extract->takeName(&II); 717 return IC.replaceInstUsesWith(II, Extract); 718 } 719 720 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg); 721 if (!IntrPG) 722 return None; 723 724 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 725 return None; 726 727 const auto PTruePattern = 728 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue(); 729 730 // Can the intrinsic's predicate be converted to a known constant index? 731 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern); 732 if (!MinNumElts) 733 return None; 734 735 unsigned Idx = MinNumElts - 1; 736 // Increment the index if extracting the element after the last active 737 // predicate element. 738 if (IsAfter) 739 ++Idx; 740 741 // Ignore extracts whose index is larger than the known minimum vector 742 // length. NOTE: This is an artificial constraint where we prefer to 743 // maintain what the user asked for until an alternative is proven faster. 744 auto *PgVTy = cast<ScalableVectorType>(Pg->getType()); 745 if (Idx >= PgVTy->getMinNumElements()) 746 return None; 747 748 // The intrinsic is extracting a fixed lane so use an extract instead. 
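  // For example (illustrative only): lastb(ptrue(vl4), %v) reads the fourth
  // element, so it can be rewritten as extractelement %v, i64 3; lasta with
  // the same predicate would use index 4 instead.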
749 auto *IdxTy = Type::getInt64Ty(II.getContext()); 750 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx)); 751 Extract->insertBefore(&II); 752 Extract->takeName(&II); 753 return IC.replaceInstUsesWith(II, Extract); 754 } 755 756 static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC, 757 IntrinsicInst &II) { 758 LLVMContext &Ctx = II.getContext(); 759 IRBuilder<> Builder(Ctx); 760 Builder.SetInsertPoint(&II); 761 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr 762 // can work with RDFFR_PP for ptest elimination. 763 auto *AllPat = 764 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 765 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 766 {II.getType()}, {AllPat}); 767 auto *RDFFR = 768 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); 769 RDFFR->takeName(&II); 770 return IC.replaceInstUsesWith(II, RDFFR); 771 } 772 773 static Optional<Instruction *> 774 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { 775 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); 776 777 if (Pattern == AArch64SVEPredPattern::all) { 778 LLVMContext &Ctx = II.getContext(); 779 IRBuilder<> Builder(Ctx); 780 Builder.SetInsertPoint(&II); 781 782 Constant *StepVal = ConstantInt::get(II.getType(), NumElts); 783 auto *VScale = Builder.CreateVScale(StepVal); 784 VScale->takeName(&II); 785 return IC.replaceInstUsesWith(II, VScale); 786 } 787 788 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); 789 790 return MinNumElts && NumElts >= MinNumElts 791 ? Optional<Instruction *>(IC.replaceInstUsesWith( 792 II, ConstantInt::get(II.getType(), MinNumElts))) 793 : None; 794 } 795 796 static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, 797 IntrinsicInst &II) { 798 IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 799 IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 800 801 if (Op1 && Op2 && 802 Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 803 Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 804 Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) { 805 806 IRBuilder<> Builder(II.getContext()); 807 Builder.SetInsertPoint(&II); 808 809 Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)}; 810 Type *Tys[] = {Op1->getArgOperand(0)->getType()}; 811 812 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); 813 814 PTest->takeName(&II); 815 return IC.replaceInstUsesWith(II, PTest); 816 } 817 818 return None; 819 } 820 821 static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC, 822 IntrinsicInst &II) { 823 // fold (fadd p a (fmul p b c)) -> (fma p a b c) 824 Value *P = II.getOperand(0); 825 Value *A = II.getOperand(1); 826 auto FMul = II.getOperand(2); 827 Value *B, *C; 828 if (!match(FMul, m_Intrinsic<Intrinsic::aarch64_sve_fmul>( 829 m_Specific(P), m_Value(B), m_Value(C)))) 830 return None; 831 832 if (!FMul->hasOneUse()) 833 return None; 834 835 llvm::FastMathFlags FAddFlags = II.getFastMathFlags(); 836 // Stop the combine when the flags on the inputs differ in case dropping flags 837 // would lead to us missing out on more beneficial optimizations. 
838 if (FAddFlags != cast<CallInst>(FMul)->getFastMathFlags()) 839 return None; 840 if (!FAddFlags.allowContract()) 841 return None; 842 843 IRBuilder<> Builder(II.getContext()); 844 Builder.SetInsertPoint(&II); 845 auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla, 846 {II.getType()}, {P, A, B, C}, &II); 847 FMLA->setFastMathFlags(FAddFlags); 848 return IC.replaceInstUsesWith(II, FMLA); 849 } 850 851 static bool isAllActivePredicate(Value *Pred) { 852 // Look through convert.from.svbool(convert.to.svbool(...) chain. 853 Value *UncastedPred; 854 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( 855 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( 856 m_Value(UncastedPred))))) 857 // If the predicate has the same or less lanes than the uncasted 858 // predicate then we know the casting has no effect. 859 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <= 860 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements()) 861 Pred = UncastedPred; 862 863 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 864 m_ConstantInt<AArch64SVEPredPattern::all>())); 865 } 866 867 static Optional<Instruction *> 868 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 869 IRBuilder<> Builder(II.getContext()); 870 Builder.SetInsertPoint(&II); 871 872 Value *Pred = II.getOperand(0); 873 Value *PtrOp = II.getOperand(1); 874 Type *VecTy = II.getType(); 875 Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo()); 876 877 if (isAllActivePredicate(Pred)) { 878 LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr); 879 Load->copyMetadata(II); 880 return IC.replaceInstUsesWith(II, Load); 881 } 882 883 CallInst *MaskedLoad = 884 Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL), 885 Pred, ConstantAggregateZero::get(VecTy)); 886 MaskedLoad->copyMetadata(II); 887 return IC.replaceInstUsesWith(II, MaskedLoad); 888 } 889 890 static Optional<Instruction *> 891 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 892 IRBuilder<> Builder(II.getContext()); 893 Builder.SetInsertPoint(&II); 894 895 Value *VecOp = II.getOperand(0); 896 Value *Pred = II.getOperand(1); 897 Value *PtrOp = II.getOperand(2); 898 Value *VecPtr = 899 Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo()); 900 901 if (isAllActivePredicate(Pred)) { 902 StoreInst *Store = Builder.CreateStore(VecOp, VecPtr); 903 Store->copyMetadata(II); 904 return IC.eraseInstFromFunction(II); 905 } 906 907 CallInst *MaskedStore = Builder.CreateMaskedStore( 908 VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred); 909 MaskedStore->copyMetadata(II); 910 return IC.eraseInstFromFunction(II); 911 } 912 913 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { 914 switch (Intrinsic) { 915 case Intrinsic::aarch64_sve_fmul: 916 return Instruction::BinaryOps::FMul; 917 case Intrinsic::aarch64_sve_fadd: 918 return Instruction::BinaryOps::FAdd; 919 case Intrinsic::aarch64_sve_fsub: 920 return Instruction::BinaryOps::FSub; 921 default: 922 return Instruction::BinaryOpsEnd; 923 } 924 } 925 926 static Optional<Instruction *> instCombineSVEVectorBinOp(InstCombiner &IC, 927 IntrinsicInst &II) { 928 auto *OpPredicate = II.getOperand(0); 929 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID()); 930 if (BinOpCode == Instruction::BinaryOpsEnd || 931 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 932 m_ConstantInt<AArch64SVEPredPattern::all>()))) 933 return None; 934 
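  // At this point the predicate is known to be an all-active ptrue, so the
  // predicated intrinsic is equivalent to the plain IR operation. Illustrative
  // example (operand names assumed, not from the original source):
  //   %r = call <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(
  //            <vscale x 2 x i1> %pg_all, <vscale x 2 x double> %a,
  //            <vscale x 2 x double> %b)
  // becomes
  //   %r = fadd <vscale x 2 x double> %a, %b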
IRBuilder<> Builder(II.getContext()); 935 Builder.SetInsertPoint(&II); 936 Builder.setFastMathFlags(II.getFastMathFlags()); 937 auto BinOp = 938 Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2)); 939 return IC.replaceInstUsesWith(II, BinOp); 940 } 941 942 static Optional<Instruction *> instCombineSVEVectorFAdd(InstCombiner &IC, 943 IntrinsicInst &II) { 944 if (auto FMLA = instCombineSVEVectorFMLA(IC, II)) 945 return FMLA; 946 return instCombineSVEVectorBinOp(IC, II); 947 } 948 949 static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC, 950 IntrinsicInst &II) { 951 auto *OpPredicate = II.getOperand(0); 952 auto *OpMultiplicand = II.getOperand(1); 953 auto *OpMultiplier = II.getOperand(2); 954 955 IRBuilder<> Builder(II.getContext()); 956 Builder.SetInsertPoint(&II); 957 958 // Return true if a given instruction is a unit splat value, false otherwise. 959 auto IsUnitSplat = [](auto *I) { 960 auto *SplatValue = getSplatValue(I); 961 if (!SplatValue) 962 return false; 963 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 964 }; 965 966 // Return true if a given instruction is an aarch64_sve_dup intrinsic call 967 // with a unit splat value, false otherwise. 968 auto IsUnitDup = [](auto *I) { 969 auto *IntrI = dyn_cast<IntrinsicInst>(I); 970 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) 971 return false; 972 973 auto *SplatValue = IntrI->getOperand(2); 974 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 975 }; 976 977 if (IsUnitSplat(OpMultiplier)) { 978 // [f]mul pg %n, (dupx 1) => %n 979 OpMultiplicand->takeName(&II); 980 return IC.replaceInstUsesWith(II, OpMultiplicand); 981 } else if (IsUnitDup(OpMultiplier)) { 982 // [f]mul pg %n, (dup pg 1) => %n 983 auto *DupInst = cast<IntrinsicInst>(OpMultiplier); 984 auto *DupPg = DupInst->getOperand(1); 985 // TODO: this is naive. The optimization is still valid if DupPg 986 // 'encompasses' OpPredicate, not only if they're the same predicate. 987 if (OpPredicate == DupPg) { 988 OpMultiplicand->takeName(&II); 989 return IC.replaceInstUsesWith(II, OpMultiplicand); 990 } 991 } 992 993 return instCombineSVEVectorBinOp(IC, II); 994 } 995 996 static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, 997 IntrinsicInst &II) { 998 IRBuilder<> Builder(II.getContext()); 999 Builder.SetInsertPoint(&II); 1000 Value *UnpackArg = II.getArgOperand(0); 1001 auto *RetTy = cast<ScalableVectorType>(II.getType()); 1002 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || 1003 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; 1004 1005 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) 1006 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) 1007 if (auto *ScalarArg = getSplatValue(UnpackArg)) { 1008 ScalarArg = 1009 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned); 1010 Value *NewVal = 1011 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg); 1012 NewVal->takeName(&II); 1013 return IC.replaceInstUsesWith(II, NewVal); 1014 } 1015 1016 return None; 1017 } 1018 static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC, 1019 IntrinsicInst &II) { 1020 auto *OpVal = II.getOperand(0); 1021 auto *OpIndices = II.getOperand(1); 1022 VectorType *VTy = cast<VectorType>(II.getType()); 1023 1024 // Check whether OpIndices is a constant splat value < minimal element count 1025 // of result. 
1026 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices)); 1027 if (!SplatValue || 1028 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue())) 1029 return None; 1030 1031 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to 1032 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. 1033 IRBuilder<> Builder(II.getContext()); 1034 Builder.SetInsertPoint(&II); 1035 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue); 1036 auto *VectorSplat = 1037 Builder.CreateVectorSplat(VTy->getElementCount(), Extract); 1038 1039 VectorSplat->takeName(&II); 1040 return IC.replaceInstUsesWith(II, VectorSplat); 1041 } 1042 1043 static Optional<Instruction *> instCombineSVETupleGet(InstCombiner &IC, 1044 IntrinsicInst &II) { 1045 // Try to remove sequences of tuple get/set. 1046 Value *SetTuple, *SetIndex, *SetValue; 1047 auto *GetTuple = II.getArgOperand(0); 1048 auto *GetIndex = II.getArgOperand(1); 1049 // Check that we have tuple_get(GetTuple, GetIndex) where GetTuple is a 1050 // call to tuple_set i.e. tuple_set(SetTuple, SetIndex, SetValue). 1051 // Make sure that the types of the current intrinsic and SetValue match 1052 // in order to safely remove the sequence. 1053 if (!match(GetTuple, 1054 m_Intrinsic<Intrinsic::aarch64_sve_tuple_set>( 1055 m_Value(SetTuple), m_Value(SetIndex), m_Value(SetValue))) || 1056 SetValue->getType() != II.getType()) 1057 return None; 1058 // Case where we get the same index right after setting it. 1059 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) --> SetValue 1060 if (GetIndex == SetIndex) 1061 return IC.replaceInstUsesWith(II, SetValue); 1062 // If we are getting a different index than what was set in the tuple_set 1063 // intrinsic. We can just set the input tuple to the one up in the chain. 1064 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) 1065 // --> tuple_get(SetTuple, GetIndex) 1066 return IC.replaceOperand(II, 0, SetTuple); 1067 } 1068 1069 static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC, 1070 IntrinsicInst &II) { 1071 // zip1(uzp1(A, B), uzp2(A, B)) --> A 1072 // zip2(uzp1(A, B), uzp2(A, B)) --> B 1073 Value *A, *B; 1074 if (match(II.getArgOperand(0), 1075 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) && 1076 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( 1077 m_Specific(A), m_Specific(B)))) 1078 return IC.replaceInstUsesWith( 1079 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); 1080 1081 return None; 1082 } 1083 1084 static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC, 1085 IntrinsicInst &II) { 1086 Value *Mask = II.getOperand(0); 1087 Value *BasePtr = II.getOperand(1); 1088 Value *Index = II.getOperand(2); 1089 Type *Ty = II.getType(); 1090 Value *PassThru = ConstantAggregateZero::get(Ty); 1091 1092 // Contiguous gather => masked load. 
1093 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) 1094 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) 1095 Value *IndexBase; 1096 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1097 m_Value(IndexBase), m_SpecificInt(1)))) { 1098 IRBuilder<> Builder(II.getContext()); 1099 Builder.SetInsertPoint(&II); 1100 1101 Align Alignment = 1102 BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); 1103 1104 Type *VecPtrTy = PointerType::getUnqual(Ty); 1105 Value *Ptr = Builder.CreateGEP( 1106 cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase); 1107 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); 1108 CallInst *MaskedLoad = 1109 Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); 1110 MaskedLoad->takeName(&II); 1111 return IC.replaceInstUsesWith(II, MaskedLoad); 1112 } 1113 1114 return None; 1115 } 1116 1117 static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC, 1118 IntrinsicInst &II) { 1119 Value *Val = II.getOperand(0); 1120 Value *Mask = II.getOperand(1); 1121 Value *BasePtr = II.getOperand(2); 1122 Value *Index = II.getOperand(3); 1123 Type *Ty = Val->getType(); 1124 1125 // Contiguous scatter => masked store. 1126 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) 1127 // => (masked.store Value (gep BasePtr IndexBase) Align Mask) 1128 Value *IndexBase; 1129 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1130 m_Value(IndexBase), m_SpecificInt(1)))) { 1131 IRBuilder<> Builder(II.getContext()); 1132 Builder.SetInsertPoint(&II); 1133 1134 Align Alignment = 1135 BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); 1136 1137 Value *Ptr = Builder.CreateGEP( 1138 cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase); 1139 Type *VecPtrTy = PointerType::getUnqual(Ty); 1140 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); 1141 1142 (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); 1143 1144 return IC.eraseInstFromFunction(II); 1145 } 1146 1147 return None; 1148 } 1149 1150 static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, 1151 IntrinsicInst &II) { 1152 IRBuilder<> Builder(II.getContext()); 1153 Builder.SetInsertPoint(&II); 1154 Type *Int32Ty = Builder.getInt32Ty(); 1155 Value *Pred = II.getOperand(0); 1156 Value *Vec = II.getOperand(1); 1157 Value *DivVec = II.getOperand(2); 1158 1159 Value *SplatValue = getSplatValue(DivVec); 1160 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue); 1161 if (!SplatConstantInt) 1162 return None; 1163 APInt Divisor = SplatConstantInt->getValue(); 1164 1165 if (Divisor.isPowerOf2()) { 1166 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1167 auto ASRD = Builder.CreateIntrinsic( 1168 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1169 return IC.replaceInstUsesWith(II, ASRD); 1170 } 1171 if (Divisor.isNegatedPowerOf2()) { 1172 Divisor.negate(); 1173 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1174 auto ASRD = Builder.CreateIntrinsic( 1175 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1176 auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg, 1177 {ASRD->getType()}, {ASRD, Pred, ASRD}); 1178 return IC.replaceInstUsesWith(II, NEG); 1179 } 1180 1181 return None; 1182 } 1183 1184 Optional<Instruction *> 1185 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, 1186 IntrinsicInst &II) const { 1187 Intrinsic::ID IID = II.getIntrinsicID(); 1188 switch (IID) { 1189 default: 1190 break; 
1191 case Intrinsic::aarch64_sve_convert_from_svbool: 1192 return instCombineConvertFromSVBool(IC, II); 1193 case Intrinsic::aarch64_sve_dup: 1194 return instCombineSVEDup(IC, II); 1195 case Intrinsic::aarch64_sve_dup_x: 1196 return instCombineSVEDupX(IC, II); 1197 case Intrinsic::aarch64_sve_cmpne: 1198 case Intrinsic::aarch64_sve_cmpne_wide: 1199 return instCombineSVECmpNE(IC, II); 1200 case Intrinsic::aarch64_sve_rdffr: 1201 return instCombineRDFFR(IC, II); 1202 case Intrinsic::aarch64_sve_lasta: 1203 case Intrinsic::aarch64_sve_lastb: 1204 return instCombineSVELast(IC, II); 1205 case Intrinsic::aarch64_sve_cntd: 1206 return instCombineSVECntElts(IC, II, 2); 1207 case Intrinsic::aarch64_sve_cntw: 1208 return instCombineSVECntElts(IC, II, 4); 1209 case Intrinsic::aarch64_sve_cnth: 1210 return instCombineSVECntElts(IC, II, 8); 1211 case Intrinsic::aarch64_sve_cntb: 1212 return instCombineSVECntElts(IC, II, 16); 1213 case Intrinsic::aarch64_sve_ptest_any: 1214 case Intrinsic::aarch64_sve_ptest_first: 1215 case Intrinsic::aarch64_sve_ptest_last: 1216 return instCombineSVEPTest(IC, II); 1217 case Intrinsic::aarch64_sve_mul: 1218 case Intrinsic::aarch64_sve_fmul: 1219 return instCombineSVEVectorMul(IC, II); 1220 case Intrinsic::aarch64_sve_fadd: 1221 return instCombineSVEVectorFAdd(IC, II); 1222 case Intrinsic::aarch64_sve_fsub: 1223 return instCombineSVEVectorBinOp(IC, II); 1224 case Intrinsic::aarch64_sve_tbl: 1225 return instCombineSVETBL(IC, II); 1226 case Intrinsic::aarch64_sve_uunpkhi: 1227 case Intrinsic::aarch64_sve_uunpklo: 1228 case Intrinsic::aarch64_sve_sunpkhi: 1229 case Intrinsic::aarch64_sve_sunpklo: 1230 return instCombineSVEUnpack(IC, II); 1231 case Intrinsic::aarch64_sve_tuple_get: 1232 return instCombineSVETupleGet(IC, II); 1233 case Intrinsic::aarch64_sve_zip1: 1234 case Intrinsic::aarch64_sve_zip2: 1235 return instCombineSVEZip(IC, II); 1236 case Intrinsic::aarch64_sve_ld1_gather_index: 1237 return instCombineLD1GatherIndex(IC, II); 1238 case Intrinsic::aarch64_sve_st1_scatter_index: 1239 return instCombineST1ScatterIndex(IC, II); 1240 case Intrinsic::aarch64_sve_ld1: 1241 return instCombineSVELD1(IC, II, DL); 1242 case Intrinsic::aarch64_sve_st1: 1243 return instCombineSVEST1(IC, II, DL); 1244 case Intrinsic::aarch64_sve_sdiv: 1245 return instCombineSVESDIV(IC, II); 1246 case Intrinsic::aarch64_sve_sel: 1247 return instCombineSVESel(IC, II); 1248 } 1249 1250 return None; 1251 } 1252 1253 Optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( 1254 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, 1255 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, 1256 std::function<void(Instruction *, unsigned, APInt, APInt &)> 1257 SimplifyAndSetOp) const { 1258 switch (II.getIntrinsicID()) { 1259 default: 1260 break; 1261 case Intrinsic::aarch64_neon_fcvtxn: 1262 case Intrinsic::aarch64_neon_rshrn: 1263 case Intrinsic::aarch64_neon_sqrshrn: 1264 case Intrinsic::aarch64_neon_sqrshrun: 1265 case Intrinsic::aarch64_neon_sqshrn: 1266 case Intrinsic::aarch64_neon_sqshrun: 1267 case Intrinsic::aarch64_neon_sqxtn: 1268 case Intrinsic::aarch64_neon_sqxtun: 1269 case Intrinsic::aarch64_neon_uqrshrn: 1270 case Intrinsic::aarch64_neon_uqshrn: 1271 case Intrinsic::aarch64_neon_uqxtn: 1272 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); 1273 break; 1274 } 1275 1276 return None; 1277 } 1278 1279 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, 1280 ArrayRef<const Value *> Args) { 1281 1282 // A helper that returns a vector type from 
the given type. The number of 1283 // elements in type Ty determine the vector width. 1284 auto toVectorTy = [&](Type *ArgTy) { 1285 return VectorType::get(ArgTy->getScalarType(), 1286 cast<VectorType>(DstTy)->getElementCount()); 1287 }; 1288 1289 // Exit early if DstTy is not a vector type whose elements are at least 1290 // 16-bits wide. 1291 if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16) 1292 return false; 1293 1294 // Determine if the operation has a widening variant. We consider both the 1295 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the 1296 // instructions. 1297 // 1298 // TODO: Add additional widening operations (e.g., shl, etc.) once we 1299 // verify that their extending operands are eliminated during code 1300 // generation. 1301 switch (Opcode) { 1302 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). 1303 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). 1304 case Instruction::Mul: // SMULL(2), UMULL(2) 1305 break; 1306 default: 1307 return false; 1308 } 1309 1310 // To be a widening instruction (either the "wide" or "long" versions), the 1311 // second operand must be a sign- or zero extend. 1312 if (Args.size() != 2 || 1313 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1]))) 1314 return false; 1315 auto *Extend = cast<CastInst>(Args[1]); 1316 auto *Arg0 = dyn_cast<CastInst>(Args[0]); 1317 1318 // A mul only has a mull version (not like addw). Both operands need to be 1319 // extending and the same type. 1320 if (Opcode == Instruction::Mul && 1321 (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() || 1322 Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType())) 1323 return false; 1324 1325 // Legalize the destination type and ensure it can be used in a widening 1326 // operation. 1327 auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); 1328 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); 1329 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) 1330 return false; 1331 1332 // Legalize the source type and ensure it can be used in a widening 1333 // operation. 1334 auto *SrcTy = toVectorTy(Extend->getSrcTy()); 1335 auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); 1336 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); 1337 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) 1338 return false; 1339 1340 // Get the total number of vector elements in the legalized types. 1341 InstructionCost NumDstEls = 1342 DstTyL.first * DstTyL.second.getVectorMinNumElements(); 1343 InstructionCost NumSrcEls = 1344 SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); 1345 1346 // Return true if the legalized types have the same number of vector elements 1347 // and the destination element type size is twice that of the source type. 1348 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; 1349 } 1350 1351 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 1352 Type *Src, 1353 TTI::CastContextHint CCH, 1354 TTI::TargetCostKind CostKind, 1355 const Instruction *I) { 1356 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1357 assert(ISD && "Invalid opcode"); 1358 1359 // If the cast is observable, and it is used by a widening instruction (e.g., 1360 // uaddl, saddw, etc.), it may be free. 
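  // Illustrative example (assumed values, not from the original source): for
  //   %e = zext <4 x i16> %x to <4 x i32>
  //   %s = add <4 x i32> %a, %e
  // the add can be selected as uaddw, which widens %x itself, so the zext is
  // costed as free when it is the second operand of such a user.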
1361 if (I && I->hasOneUser()) { 1362 auto *SingleUser = cast<Instruction>(*I->user_begin()); 1363 SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); 1364 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { 1365 // If the cast is the second operand, it is free. We will generate either 1366 // a "wide" or "long" version of the widening instruction. 1367 if (I == SingleUser->getOperand(1)) 1368 return 0; 1369 // If the cast is not the second operand, it will be free if it looks the 1370 // same as the second operand. In this case, we will generate a "long" 1371 // version of the widening instruction. 1372 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) 1373 if (I->getOpcode() == unsigned(Cast->getOpcode()) && 1374 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) 1375 return 0; 1376 } 1377 } 1378 1379 // TODO: Allow non-throughput costs that aren't binary. 1380 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { 1381 if (CostKind != TTI::TCK_RecipThroughput) 1382 return Cost == 0 ? 0 : 1; 1383 return Cost; 1384 }; 1385 1386 EVT SrcTy = TLI->getValueType(DL, Src); 1387 EVT DstTy = TLI->getValueType(DL, Dst); 1388 1389 if (!SrcTy.isSimple() || !DstTy.isSimple()) 1390 return AdjustCost( 1391 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1392 1393 static const TypeConversionCostTblEntry 1394 ConversionTbl[] = { 1395 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, 1396 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, 1397 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, 1398 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, 1399 1400 // Truncations on nxvmiN 1401 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, 1402 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, 1403 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, 1404 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, 1405 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, 1406 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, 1407 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, 1408 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, 1409 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, 1410 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, 1411 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, 1412 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, 1413 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, 1414 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, 1415 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, 1416 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, 1417 1418 // The number of shll instructions for the extension. 
1419 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1420 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1421 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1422 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1423 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1424 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1425 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1426 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1427 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1428 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1429 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1430 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1431 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1432 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1433 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1434 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1435 1436 // LowerVectorINT_TO_FP: 1437 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1438 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1439 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1440 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1441 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1442 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1443 1444 // Complex: to v2f32 1445 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1446 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1447 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1448 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1449 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1450 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1451 1452 // Complex: to v4f32 1453 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, 1454 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1455 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 1456 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1457 1458 // Complex: to v8f32 1459 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1460 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 1461 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1462 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 1463 1464 // Complex: to v16f32 1465 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1466 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1467 1468 // Complex: to v2f64 1469 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1470 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1471 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1472 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1473 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1474 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1475 1476 1477 // LowerVectorFP_TO_INT 1478 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, 1479 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 1480 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 1481 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, 1482 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 1483 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 1484 1485 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). 
1486 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, 1487 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, 1488 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, 1489 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, 1490 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, 1491 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, 1492 1493 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 1494 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, 1495 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, 1496 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, 1497 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, 1498 1499 // Complex, from nxv2f32. 1500 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1501 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1502 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1503 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1504 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1505 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1506 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1507 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1508 1509 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. 1510 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, 1511 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, 1512 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, 1513 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, 1514 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, 1515 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, 1516 1517 // Complex, from nxv2f64. 1518 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1519 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1520 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1521 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1522 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1523 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1524 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1525 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1526 1527 // Complex, from nxv4f32. 1528 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1529 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1530 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1531 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1532 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1533 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1534 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1535 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1536 1537 // Complex, from nxv8f64. Illegal -> illegal conversions not required. 1538 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1539 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1540 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1541 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1542 1543 // Complex, from nxv4f64. Illegal -> illegal conversions not required. 1544 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1545 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1546 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1547 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1548 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1549 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1550 1551 // Complex, from nxv8f32. Illegal -> illegal conversions not required. 1552 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 1553 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 1554 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 1555 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 1556 1557 // Complex, from nxv8f16. 
1558 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 1559 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 1560 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 1561 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 1562 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 1563 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 1564 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 1565 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 1566 1567 // Complex, from nxv4f16. 1568 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 1569 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 1570 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 1571 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 1572 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 1573 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 1574 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 1575 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 1576 1577 // Complex, from nxv2f16. 1578 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 1579 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 1580 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 1581 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 1582 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 1583 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 1584 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 1585 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 1586 1587 // Truncate from nxvmf32 to nxvmf16. 1588 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, 1589 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, 1590 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, 1591 1592 // Truncate from nxvmf64 to nxvmf16. 1593 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 }, 1594 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 }, 1595 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 }, 1596 1597 // Truncate from nxvmf64 to nxvmf32. 1598 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 }, 1599 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 }, 1600 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 }, 1601 1602 // Extend from nxvmf16 to nxvmf32. 1603 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, 1604 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, 1605 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, 1606 1607 // Extend from nxvmf16 to nxvmf64. 1608 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, 1609 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, 1610 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, 1611 1612 // Extend from nxvmf32 to nxvmf64. 
1613 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, 1614 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, 1615 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, 1616 1617 // Bitcasts from float to integer 1618 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 }, 1619 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 }, 1620 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 }, 1621 1622 // Bitcasts from integer to float 1623 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 }, 1624 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 }, 1625 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 }, 1626 }; 1627 1628 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, 1629 DstTy.getSimpleVT(), 1630 SrcTy.getSimpleVT())) 1631 return AdjustCost(Entry->Cost); 1632 1633 static const TypeConversionCostTblEntry FP16Tbl[] = { 1634 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs 1635 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, 1636 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs 1637 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, 1638 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs 1639 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, 1640 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn 1641 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, 1642 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs 1643 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, 1644 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs 1645 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, 1646 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn 1647 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, 1648 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs 1649 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, 1650 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs 1651 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, 1652 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf 1653 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf 1654 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf 1655 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf 1656 }; 1657 1658 if (ST->hasFullFP16()) 1659 if (const auto *Entry = ConvertCostTableLookup( 1660 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 1661 return AdjustCost(Entry->Cost); 1662 1663 return AdjustCost( 1664 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1665 } 1666 1667 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, 1668 Type *Dst, 1669 VectorType *VecTy, 1670 unsigned Index) { 1671 1672 // Make sure we were given a valid extend opcode. 1673 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && 1674 "Invalid opcode"); 1675 1676 // We are extending an element we extract from a vector, so the source type 1677 // of the extend is the element type of the vector. 1678 auto *Src = VecTy->getElementType(); 1679 1680 // Sign- and zero-extends are for integer types only. 1681 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); 1682 1683 // Get the cost for the extract. We compute the cost (if any) for the extend 1684 // below. 1685 InstructionCost Cost = 1686 getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); 1687 1688 // Legalize the types. 
1689 auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); 1690 auto DstVT = TLI->getValueType(DL, Dst); 1691 auto SrcVT = TLI->getValueType(DL, Src); 1692 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1693 1694 // If the resulting type is still a vector and the destination type is legal, 1695 // we may get the extension for free. If not, get the default cost for the 1696 // extend. 1697 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) 1698 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1699 CostKind); 1700 1701 // The destination type should be larger than the element type. If not, get 1702 // the default cost for the extend. 1703 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) 1704 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1705 CostKind); 1706 1707 switch (Opcode) { 1708 default: 1709 llvm_unreachable("Opcode should be either SExt or ZExt"); 1710 1711 // For sign-extends, we only need a smov, which performs the extension 1712 // automatically. 1713 case Instruction::SExt: 1714 return Cost; 1715 1716 // For zero-extends, the extend is performed automatically by a umov unless 1717 // the destination type is i64 and the element type is i8 or i16. 1718 case Instruction::ZExt: 1719 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) 1720 return Cost; 1721 } 1722 1723 // If we are unable to perform the extend for free, get the default cost. 1724 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1725 CostKind); 1726 } 1727 1728 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, 1729 TTI::TargetCostKind CostKind, 1730 const Instruction *I) { 1731 if (CostKind != TTI::TCK_RecipThroughput) 1732 return Opcode == Instruction::PHI ? 0 : 1; 1733 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); 1734 // Branches are assumed to be predicted. 1735 return 0; 1736 } 1737 1738 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 1739 unsigned Index) { 1740 assert(Val->isVectorTy() && "This must be a vector type"); 1741 1742 if (Index != -1U) { 1743 // Legalize the type. 1744 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); 1745 1746 // This type is legalized to a scalar type. 1747 if (!LT.second.isVector()) 1748 return 0; 1749 1750 // The type may be split. For fixed-width vectors we can normalize the 1751 // index to the new type. 1752 if (LT.second.isFixedLengthVector()) { 1753 unsigned Width = LT.second.getVectorNumElements(); 1754 Index = Index % Width; 1755 } 1756 1757 // The element at index zero is already inside the vector. 1758 if (Index == 0) 1759 return 0; 1760 } 1761 1762 // All other insert/extracts cost this much. 1763 return ST->getVectorInsertExtractBaseCost(); 1764 } 1765 1766 InstructionCost AArch64TTIImpl::getArithmeticInstrCost( 1767 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 1768 TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, 1769 TTI::OperandValueProperties Opd1PropInfo, 1770 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, 1771 const Instruction *CxtI) { 1772 // TODO: Handle more cost kinds. 1773 if (CostKind != TTI::TCK_RecipThroughput) 1774 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 1775 Opd2Info, Opd1PropInfo, 1776 Opd2PropInfo, Args, CxtI); 1777 1778 // Legalize the type. 
1779 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1780 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1781
1782 switch (ISD) {
1783 default:
1784 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1785 Opd2Info, Opd1PropInfo, Opd2PropInfo);
1786 case ISD::SDIV:
1787 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
1788 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
1789 // On AArch64, scalar signed division by a power-of-two constant is
1790 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
1791 // The OperandValue properties may not be the same as those of the
1792 // previous operation; conservatively assume OP_None.
1793 InstructionCost Cost = getArithmeticInstrCost(
1794 Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
1795 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1796 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Opd1Info,
1797 Opd2Info, TargetTransformInfo::OP_None,
1798 TargetTransformInfo::OP_None);
1799 Cost += getArithmeticInstrCost(
1800 Instruction::Select, Ty, CostKind, Opd1Info, Opd2Info,
1801 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1802 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Opd1Info,
1803 Opd2Info, TargetTransformInfo::OP_None,
1804 TargetTransformInfo::OP_None);
1805 return Cost;
1806 }
1807 LLVM_FALLTHROUGH;
1808 case ISD::UDIV: {
1809 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
1810 auto VT = TLI->getValueType(DL, Ty);
1811 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
1812 // Vector signed division by constant is expanded to the
1813 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
1814 // to MULHU + SUB + SRL + ADD + SRL.
1815 InstructionCost MulCost = getArithmeticInstrCost(
1816 Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
1817 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1818 InstructionCost AddCost = getArithmeticInstrCost(
1819 Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
1820 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1821 InstructionCost ShrCost = getArithmeticInstrCost(
1822 Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
1823 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1824 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
1825 }
1826 }
1827
1828 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1829 Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
1830 if (Ty->isVectorTy()) {
1831 // On AArch64, vector divisions are not supported natively and are
1832 // expanded into scalar divisions of each pair of elements.
1833 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
1834 Opd1Info, Opd2Info, Opd1PropInfo,
1835 Opd2PropInfo);
1836 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
1837 Opd1Info, Opd2Info, Opd1PropInfo,
1838 Opd2PropInfo);
1839 // TODO: if one of the arguments is scalar, then it's not necessary to
1840 // double the cost of handling the vector elements.
1841 Cost += Cost;
1842 }
1843 return Cost;
1844 }
1845 case ISD::MUL:
1846 // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
1847 // as elements are extracted from the vectors and the muls scalarized.
1848 // As getScalarizationOverhead is a bit too pessimistic, we estimate the
1849 // cost for an i64 vector directly here, which is:
1850 // - four 2-cost i64 extracts,
1851 // - two 2-cost i64 inserts, and
1852 // - two 1-cost muls.
1853 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
1854 // LT.first = 2 the cost is 28. If both operands are extensions it will not
1855 // need to scalarize so the cost can be cheaper (smull or umull).
1856 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
1857 return LT.first;
1858 return LT.first * 14;
1859 case ISD::ADD:
1860 case ISD::XOR:
1861 case ISD::OR:
1862 case ISD::AND:
1863 case ISD::SRL:
1864 case ISD::SRA:
1865 case ISD::SHL:
1866 // These nodes are marked as 'custom' for combining purposes only.
1867 // We know that they are legal. See LowerAdd in ISelLowering.
1868 return LT.first;
1869
1870 case ISD::FADD:
1871 case ISD::FSUB:
1872 case ISD::FMUL:
1873 case ISD::FDIV:
1874 case ISD::FNEG:
1875 // These nodes are marked as 'custom' just to lower them to SVE.
1876 // We know said lowering will incur no additional cost.
1877 if (!Ty->getScalarType()->isFP128Ty())
1878 return 2 * LT.first;
1879
1880 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1881 Opd2Info, Opd1PropInfo, Opd2PropInfo);
1882 }
1883 }
1884
1885 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
1886 ScalarEvolution *SE,
1887 const SCEV *Ptr) {
1888 // Address computations in vectorized code with non-consecutive addresses will
1889 // likely result in more instructions compared to scalar code where the
1890 // computation can more often be merged into the index mode. The resulting
1891 // extra micro-ops can significantly decrease throughput.
1892 unsigned NumVectorInstToHideOverhead = 10;
1893 int MaxMergeDistance = 64;
1894
1895 if (Ty->isVectorTy() && SE &&
1896 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1897 return NumVectorInstToHideOverhead;
1898
1899 // In many cases the address computation is not merged into the instruction
1900 // addressing mode.
1901 return 1;
1902 }
1903
1904 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1905 Type *CondTy,
1906 CmpInst::Predicate VecPred,
1907 TTI::TargetCostKind CostKind,
1908 const Instruction *I) {
1909 // TODO: Handle other cost kinds.
1910 if (CostKind != TTI::TCK_RecipThroughput)
1911 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1912 I);
1913
1914 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1915 // We don't lower some vector selects that are wider than the register width
1916 // very well.
1917 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
1918 // We would need this many instructions to hide the scalarization happening.
1919 const int AmortizationCost = 20;
1920
1921 // If VecPred is not set, check if we can get a predicate from the context
1922 // instruction, if its type matches the requested ValTy.
1923 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
1924 CmpInst::Predicate CurrentPred;
1925 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
1926 m_Value())))
1927 VecPred = CurrentPred;
1928 }
1929 // Check if we have a compare/select chain that can be lowered using
1930 // a (F)CMxx & BFI pair.
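// As a hedged illustration (IR sketch, not taken from an existing test), a
// chain such as:
//   %c = fcmp olt <4 x float> %a, %b
//   %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
// uses FCMP_OLT, which is accepted below, and legalizes to v4f32, which is in
// ValidMinMaxTys, so the select is costed at LT.first rather than being
// treated as a scalarized wide select.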
1931 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
1932 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
1933 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
1934 VecPred == CmpInst::FCMP_UNE) {
1935 static const auto ValidMinMaxTys = {
1936 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1937 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
1938 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
1939
1940 auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
1941 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
1942 (ST->hasFullFP16() &&
1943 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
1944 return LT.first;
1945 }
1946
1947 static const TypeConversionCostTblEntry
1948 VectorSelectTbl[] = {
1949 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
1950 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
1951 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
1952 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
1953 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
1954 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
1955 };
1956
1957 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1958 EVT SelValTy = TLI->getValueType(DL, ValTy);
1959 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1960 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
1961 SelCondTy.getSimpleVT(),
1962 SelValTy.getSimpleVT()))
1963 return Entry->Cost;
1964 }
1965 }
1966 // The base case handles scalable vectors fine for now, since it treats the
1967 // cost as 1 * legalization cost.
1968 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1969 }
1970
1971 AArch64TTIImpl::TTI::MemCmpExpansionOptions
1972 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
1973 TTI::MemCmpExpansionOptions Options;
1974 if (ST->requiresStrictAlign()) {
1975 // TODO: Add cost modeling for strict align. Misaligned loads expand to
1976 // a bunch of instructions when strict align is enabled.
1977 return Options;
1978 }
1979 Options.AllowOverlappingLoads = true;
1980 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
1981 Options.NumLoadsPerBlock = Options.MaxNumLoads;
1982 // TODO: Though vector loads usually perform well on AArch64, on some targets
1983 // they may wake up the FP unit, which raises the power consumption. Perhaps
1984 // they could be used with no holds barred (-O3).
1985 Options.LoadSizes = {8, 4, 2, 1};
1986 return Options;
1987 }
1988
1989 InstructionCost
1990 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
1991 Align Alignment, unsigned AddressSpace,
1992 TTI::TargetCostKind CostKind) {
1993 if (useNeonVector(Src))
1994 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1995 CostKind);
1996 auto LT = TLI->getTypeLegalizationCost(DL, Src);
1997 if (!LT.first.isValid())
1998 return InstructionCost::getInvalid();
1999
2000 // The code-generator is currently not able to handle scalable vectors
2001 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2002 // it. This change will be removed when code-generation for these types is
2003 // sufficiently reliable.
2004 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1)) 2005 return InstructionCost::getInvalid(); 2006 2007 return LT.first * 2; 2008 } 2009 2010 static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { 2011 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead; 2012 } 2013 2014 InstructionCost AArch64TTIImpl::getGatherScatterOpCost( 2015 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 2016 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 2017 if (useNeonVector(DataTy)) 2018 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 2019 Alignment, CostKind, I); 2020 auto *VT = cast<VectorType>(DataTy); 2021 auto LT = TLI->getTypeLegalizationCost(DL, DataTy); 2022 if (!LT.first.isValid()) 2023 return InstructionCost::getInvalid(); 2024 2025 // The code-generator is currently not able to handle scalable vectors 2026 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2027 // it. This change will be removed when code-generation for these types is 2028 // sufficiently reliable. 2029 if (cast<VectorType>(DataTy)->getElementCount() == 2030 ElementCount::getScalable(1)) 2031 return InstructionCost::getInvalid(); 2032 2033 ElementCount LegalVF = LT.second.getVectorElementCount(); 2034 InstructionCost MemOpCost = 2035 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); 2036 // Add on an overhead cost for using gathers/scatters. 2037 // TODO: At the moment this is applied unilaterally for all CPUs, but at some 2038 // point we may want a per-CPU overhead. 2039 MemOpCost *= getSVEGatherScatterOverhead(Opcode); 2040 return LT.first * MemOpCost * getMaxNumElements(LegalVF); 2041 } 2042 2043 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { 2044 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); 2045 } 2046 2047 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, 2048 MaybeAlign Alignment, 2049 unsigned AddressSpace, 2050 TTI::TargetCostKind CostKind, 2051 const Instruction *I) { 2052 EVT VT = TLI->getValueType(DL, Ty, true); 2053 // Type legalization can't handle structs 2054 if (VT == MVT::Other) 2055 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, 2056 CostKind); 2057 2058 auto LT = TLI->getTypeLegalizationCost(DL, Ty); 2059 if (!LT.first.isValid()) 2060 return InstructionCost::getInvalid(); 2061 2062 // The code-generator is currently not able to handle scalable vectors 2063 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2064 // it. This change will be removed when code-generation for these types is 2065 // sufficiently reliable. 2066 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) 2067 if (VTy->getElementCount() == ElementCount::getScalable(1)) 2068 return InstructionCost::getInvalid(); 2069 2070 // TODO: consider latency as well for TCK_SizeAndLatency. 2071 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) 2072 return LT.first; 2073 2074 if (CostKind != TTI::TCK_RecipThroughput) 2075 return 1; 2076 2077 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && 2078 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) { 2079 // Unaligned stores are extremely inefficient. We don't split all 2080 // unaligned 128-bit stores because the negative impact that has shown in 2081 // practice on inlined block copy code. 
2082 // We make such stores expensive so that we will only vectorize if there
2083 // are 6 other instructions getting vectorized.
2084 const int AmortizationCost = 6;
2085
2086 return LT.first * 2 * AmortizationCost;
2087 }
2088
2089 // Check truncating stores and extending loads.
2090 if (useNeonVector(Ty) &&
2091 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
2092 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
2093 if (VT == MVT::v4i8)
2094 return 2;
2095 // Otherwise we need to scalarize.
2096 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
2097 }
2098
2099 return LT.first;
2100 }
2101
2102 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
2103 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
2104 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
2105 bool UseMaskForCond, bool UseMaskForGaps) {
2106 assert(Factor >= 2 && "Invalid interleave factor");
2107 auto *VecVTy = cast<FixedVectorType>(VecTy);
2108
2109 if (!UseMaskForCond && !UseMaskForGaps &&
2110 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
2111 unsigned NumElts = VecVTy->getNumElements();
2112 auto *SubVecTy =
2113 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
2114
2115 // ldN/stN only support legal vector types of size 64 or 128 in bits.
2116 // Accesses having vector types that are a multiple of 128 bits can be
2117 // matched to more than one ldN/stN instruction.
2118 bool UseScalable;
2119 if (NumElts % Factor == 0 &&
2120 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
2121 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
2122 }
2123
2124 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2125 Alignment, AddressSpace, CostKind,
2126 UseMaskForCond, UseMaskForGaps);
2127 }
2128
2129 InstructionCost
2130 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
2131 InstructionCost Cost = 0;
2132 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2133 for (auto *I : Tys) {
2134 if (!I->isVectorTy())
2135 continue;
2136 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
2137 128)
2138 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
2139 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
2140 }
2141 return Cost;
2142 }
2143
2144 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
2145 return ST->getMaxInterleaveFactor();
2146 }
2147
2148 // For Falkor, we want to avoid having too many strided loads in a loop since
2149 // that can exhaust the HW prefetcher resources. We adjust the unroller
2150 // MaxCount preference below to attempt to ensure unrolling doesn't create too
2151 // many strided loads.
2152 static void
2153 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2154 TargetTransformInfo::UnrollingPreferences &UP) {
2155 enum { MaxStridedLoads = 7 };
2156 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
2157 int StridedLoads = 0;
2158 // FIXME? We could make this more precise by looking at the CFG and
2159 // e.g. not counting loads on each side of an if-then-else diamond.
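// As a hedged illustration (hypothetical source loop, not from a test), in
//   for (int i = 0; i < n; ++i) sum += a[i];
// the pointer operand of the load of a[i] has an affine SCEVAddRecExpr, so it
// is counted as a strided load below, while a load through a loop-invariant
// pointer is skipped.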
2160 for (const auto BB : L->blocks()) { 2161 for (auto &I : *BB) { 2162 LoadInst *LMemI = dyn_cast<LoadInst>(&I); 2163 if (!LMemI) 2164 continue; 2165 2166 Value *PtrValue = LMemI->getPointerOperand(); 2167 if (L->isLoopInvariant(PtrValue)) 2168 continue; 2169 2170 const SCEV *LSCEV = SE.getSCEV(PtrValue); 2171 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); 2172 if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) 2173 continue; 2174 2175 // FIXME? We could take pairing of unrolled load copies into account 2176 // by looking at the AddRec, but we would probably have to limit this 2177 // to loops with no stores or other memory optimization barriers. 2178 ++StridedLoads; 2179 // We've seen enough strided loads that seeing more won't make a 2180 // difference. 2181 if (StridedLoads > MaxStridedLoads / 2) 2182 return StridedLoads; 2183 } 2184 } 2185 return StridedLoads; 2186 }; 2187 2188 int StridedLoads = countStridedLoads(L, SE); 2189 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads 2190 << " strided loads\n"); 2191 // Pick the largest power of 2 unroll count that won't result in too many 2192 // strided loads. 2193 if (StridedLoads) { 2194 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); 2195 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " 2196 << UP.MaxCount << '\n'); 2197 } 2198 } 2199 2200 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2201 TTI::UnrollingPreferences &UP, 2202 OptimizationRemarkEmitter *ORE) { 2203 // Enable partial unrolling and runtime unrolling. 2204 BaseT::getUnrollingPreferences(L, SE, UP, ORE); 2205 2206 UP.UpperBound = true; 2207 2208 // For inner loop, it is more likely to be a hot one, and the runtime check 2209 // can be promoted out from LICM pass, so the overhead is less, let's try 2210 // a larger threshold to unroll more loops. 2211 if (L->getLoopDepth() > 1) 2212 UP.PartialThreshold *= 2; 2213 2214 // Disable partial & runtime unrolling on -Os. 2215 UP.PartialOptSizeThreshold = 0; 2216 2217 if (ST->getProcFamily() == AArch64Subtarget::Falkor && 2218 EnableFalkorHWPFUnrollFix) 2219 getFalkorUnrollingPreferences(L, SE, UP); 2220 2221 // Scan the loop: don't unroll loops with calls as this could prevent 2222 // inlining. Don't unroll vector loops either, as they don't benefit much from 2223 // unrolling. 2224 for (auto *BB : L->getBlocks()) { 2225 for (auto &I : *BB) { 2226 // Don't unroll vectorised loop. 
2227 if (I.getType()->isVectorTy()) 2228 return; 2229 2230 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 2231 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 2232 if (!isLoweredToCall(F)) 2233 continue; 2234 } 2235 return; 2236 } 2237 } 2238 } 2239 2240 // Enable runtime unrolling for in-order models 2241 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by 2242 // checking for that case, we can ensure that the default behaviour is 2243 // unchanged 2244 if (ST->getProcFamily() != AArch64Subtarget::Others && 2245 !ST->getSchedModel().isOutOfOrder()) { 2246 UP.Runtime = true; 2247 UP.Partial = true; 2248 UP.UnrollRemainder = true; 2249 UP.DefaultUnrollRuntimeCount = 4; 2250 2251 UP.UnrollAndJam = true; 2252 UP.UnrollAndJamInnerLoopThreshold = 60; 2253 } 2254 } 2255 2256 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 2257 TTI::PeelingPreferences &PP) { 2258 BaseT::getPeelingPreferences(L, SE, PP); 2259 } 2260 2261 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, 2262 Type *ExpectedType) { 2263 switch (Inst->getIntrinsicID()) { 2264 default: 2265 return nullptr; 2266 case Intrinsic::aarch64_neon_st2: 2267 case Intrinsic::aarch64_neon_st3: 2268 case Intrinsic::aarch64_neon_st4: { 2269 // Create a struct type 2270 StructType *ST = dyn_cast<StructType>(ExpectedType); 2271 if (!ST) 2272 return nullptr; 2273 unsigned NumElts = Inst->arg_size() - 1; 2274 if (ST->getNumElements() != NumElts) 2275 return nullptr; 2276 for (unsigned i = 0, e = NumElts; i != e; ++i) { 2277 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) 2278 return nullptr; 2279 } 2280 Value *Res = UndefValue::get(ExpectedType); 2281 IRBuilder<> Builder(Inst); 2282 for (unsigned i = 0, e = NumElts; i != e; ++i) { 2283 Value *L = Inst->getArgOperand(i); 2284 Res = Builder.CreateInsertValue(Res, L, i); 2285 } 2286 return Res; 2287 } 2288 case Intrinsic::aarch64_neon_ld2: 2289 case Intrinsic::aarch64_neon_ld3: 2290 case Intrinsic::aarch64_neon_ld4: 2291 if (Inst->getType() == ExpectedType) 2292 return Inst; 2293 return nullptr; 2294 } 2295 } 2296 2297 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, 2298 MemIntrinsicInfo &Info) { 2299 switch (Inst->getIntrinsicID()) { 2300 default: 2301 break; 2302 case Intrinsic::aarch64_neon_ld2: 2303 case Intrinsic::aarch64_neon_ld3: 2304 case Intrinsic::aarch64_neon_ld4: 2305 Info.ReadMem = true; 2306 Info.WriteMem = false; 2307 Info.PtrVal = Inst->getArgOperand(0); 2308 break; 2309 case Intrinsic::aarch64_neon_st2: 2310 case Intrinsic::aarch64_neon_st3: 2311 case Intrinsic::aarch64_neon_st4: 2312 Info.ReadMem = false; 2313 Info.WriteMem = true; 2314 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1); 2315 break; 2316 } 2317 2318 switch (Inst->getIntrinsicID()) { 2319 default: 2320 return false; 2321 case Intrinsic::aarch64_neon_ld2: 2322 case Intrinsic::aarch64_neon_st2: 2323 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; 2324 break; 2325 case Intrinsic::aarch64_neon_ld3: 2326 case Intrinsic::aarch64_neon_st3: 2327 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; 2328 break; 2329 case Intrinsic::aarch64_neon_ld4: 2330 case Intrinsic::aarch64_neon_st4: 2331 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; 2332 break; 2333 } 2334 return true; 2335 } 2336 2337 /// See if \p I should be considered for address type promotion. We check if \p 2338 /// I is a sext with right type and used in memory accesses. 
If it used in a 2339 /// "complex" getelementptr, we allow it to be promoted without finding other 2340 /// sext instructions that sign extended the same initial value. A getelementptr 2341 /// is considered as "complex" if it has more than 2 operands. 2342 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( 2343 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { 2344 bool Considerable = false; 2345 AllowPromotionWithoutCommonHeader = false; 2346 if (!isa<SExtInst>(&I)) 2347 return false; 2348 Type *ConsideredSExtType = 2349 Type::getInt64Ty(I.getParent()->getParent()->getContext()); 2350 if (I.getType() != ConsideredSExtType) 2351 return false; 2352 // See if the sext is the one with the right type and used in at least one 2353 // GetElementPtrInst. 2354 for (const User *U : I.users()) { 2355 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { 2356 Considerable = true; 2357 // A getelementptr is considered as "complex" if it has more than 2 2358 // operands. We will promote a SExt used in such complex GEP as we 2359 // expect some computation to be merged if they are done on 64 bits. 2360 if (GEPInst->getNumOperands() > 2) { 2361 AllowPromotionWithoutCommonHeader = true; 2362 break; 2363 } 2364 } 2365 } 2366 return Considerable; 2367 } 2368 2369 bool AArch64TTIImpl::isLegalToVectorizeReduction( 2370 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { 2371 if (!VF.isScalable()) 2372 return true; 2373 2374 Type *Ty = RdxDesc.getRecurrenceType(); 2375 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) 2376 return false; 2377 2378 switch (RdxDesc.getRecurrenceKind()) { 2379 case RecurKind::Add: 2380 case RecurKind::FAdd: 2381 case RecurKind::And: 2382 case RecurKind::Or: 2383 case RecurKind::Xor: 2384 case RecurKind::SMin: 2385 case RecurKind::SMax: 2386 case RecurKind::UMin: 2387 case RecurKind::UMax: 2388 case RecurKind::FMin: 2389 case RecurKind::FMax: 2390 case RecurKind::SelectICmp: 2391 case RecurKind::SelectFCmp: 2392 case RecurKind::FMulAdd: 2393 return true; 2394 default: 2395 return false; 2396 } 2397 } 2398 2399 InstructionCost 2400 AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, 2401 bool IsUnsigned, 2402 TTI::TargetCostKind CostKind) { 2403 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 2404 2405 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) 2406 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); 2407 2408 assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) && 2409 "Both vector needs to be equally scalable"); 2410 2411 InstructionCost LegalizationCost = 0; 2412 if (LT.first > 1) { 2413 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); 2414 unsigned MinMaxOpcode = 2415 Ty->isFPOrFPVectorTy() 2416 ? Intrinsic::maxnum 2417 : (IsUnsigned ? 
Intrinsic::umin : Intrinsic::smin); 2418 IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy}); 2419 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1); 2420 } 2421 2422 return LegalizationCost + /*Cost of horizontal reduction*/ 2; 2423 } 2424 2425 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( 2426 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { 2427 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 2428 InstructionCost LegalizationCost = 0; 2429 if (LT.first > 1) { 2430 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); 2431 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); 2432 LegalizationCost *= LT.first - 1; 2433 } 2434 2435 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2436 assert(ISD && "Invalid opcode"); 2437 // Add the final reduction cost for the legal horizontal reduction 2438 switch (ISD) { 2439 case ISD::ADD: 2440 case ISD::AND: 2441 case ISD::OR: 2442 case ISD::XOR: 2443 case ISD::FADD: 2444 return LegalizationCost + 2; 2445 default: 2446 return InstructionCost::getInvalid(); 2447 } 2448 } 2449 2450 InstructionCost 2451 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, 2452 Optional<FastMathFlags> FMF, 2453 TTI::TargetCostKind CostKind) { 2454 if (TTI::requiresOrderedReduction(FMF)) { 2455 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) { 2456 InstructionCost BaseCost = 2457 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 2458 // Add on extra cost to reflect the extra overhead on some CPUs. We still 2459 // end up vectorizing for more computationally intensive loops. 2460 return BaseCost + FixedVTy->getNumElements(); 2461 } 2462 2463 if (Opcode != Instruction::FAdd) 2464 return InstructionCost::getInvalid(); 2465 2466 auto *VTy = cast<ScalableVectorType>(ValTy); 2467 InstructionCost Cost = 2468 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); 2469 Cost *= getMaxNumElements(VTy->getElementCount()); 2470 return Cost; 2471 } 2472 2473 if (isa<ScalableVectorType>(ValTy)) 2474 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); 2475 2476 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 2477 MVT MTy = LT.second; 2478 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2479 assert(ISD && "Invalid opcode"); 2480 2481 // Horizontal adds can use the 'addv' instruction. We model the cost of these 2482 // instructions as twice a normal vector add, plus 1 for each legalization 2483 // step (LT.first). This is the only arithmetic vector reduction operation for 2484 // which we have an instruction. 
2485 // OR, XOR and AND costs should match the codegen from: 2486 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll 2487 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll 2488 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll 2489 static const CostTblEntry CostTblNoPairwise[]{ 2490 {ISD::ADD, MVT::v8i8, 2}, 2491 {ISD::ADD, MVT::v16i8, 2}, 2492 {ISD::ADD, MVT::v4i16, 2}, 2493 {ISD::ADD, MVT::v8i16, 2}, 2494 {ISD::ADD, MVT::v4i32, 2}, 2495 {ISD::OR, MVT::v8i8, 15}, 2496 {ISD::OR, MVT::v16i8, 17}, 2497 {ISD::OR, MVT::v4i16, 7}, 2498 {ISD::OR, MVT::v8i16, 9}, 2499 {ISD::OR, MVT::v2i32, 3}, 2500 {ISD::OR, MVT::v4i32, 5}, 2501 {ISD::OR, MVT::v2i64, 3}, 2502 {ISD::XOR, MVT::v8i8, 15}, 2503 {ISD::XOR, MVT::v16i8, 17}, 2504 {ISD::XOR, MVT::v4i16, 7}, 2505 {ISD::XOR, MVT::v8i16, 9}, 2506 {ISD::XOR, MVT::v2i32, 3}, 2507 {ISD::XOR, MVT::v4i32, 5}, 2508 {ISD::XOR, MVT::v2i64, 3}, 2509 {ISD::AND, MVT::v8i8, 15}, 2510 {ISD::AND, MVT::v16i8, 17}, 2511 {ISD::AND, MVT::v4i16, 7}, 2512 {ISD::AND, MVT::v8i16, 9}, 2513 {ISD::AND, MVT::v2i32, 3}, 2514 {ISD::AND, MVT::v4i32, 5}, 2515 {ISD::AND, MVT::v2i64, 3}, 2516 }; 2517 switch (ISD) { 2518 default: 2519 break; 2520 case ISD::ADD: 2521 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) 2522 return (LT.first - 1) + Entry->Cost; 2523 break; 2524 case ISD::XOR: 2525 case ISD::AND: 2526 case ISD::OR: 2527 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy); 2528 if (!Entry) 2529 break; 2530 auto *ValVTy = cast<FixedVectorType>(ValTy); 2531 if (!ValVTy->getElementType()->isIntegerTy(1) && 2532 MTy.getVectorNumElements() <= ValVTy->getNumElements() && 2533 isPowerOf2_32(ValVTy->getNumElements())) { 2534 InstructionCost ExtraCost = 0; 2535 if (LT.first != 1) { 2536 // Type needs to be split, so there is an extra cost of LT.first - 1 2537 // arithmetic ops. 2538 auto *Ty = FixedVectorType::get(ValTy->getElementType(), 2539 MTy.getVectorNumElements()); 2540 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); 2541 ExtraCost *= LT.first - 1; 2542 } 2543 return Entry->Cost + ExtraCost; 2544 } 2545 break; 2546 } 2547 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 2548 } 2549 2550 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { 2551 static const CostTblEntry ShuffleTbl[] = { 2552 { TTI::SK_Splice, MVT::nxv16i8, 1 }, 2553 { TTI::SK_Splice, MVT::nxv8i16, 1 }, 2554 { TTI::SK_Splice, MVT::nxv4i32, 1 }, 2555 { TTI::SK_Splice, MVT::nxv2i64, 1 }, 2556 { TTI::SK_Splice, MVT::nxv2f16, 1 }, 2557 { TTI::SK_Splice, MVT::nxv4f16, 1 }, 2558 { TTI::SK_Splice, MVT::nxv8f16, 1 }, 2559 { TTI::SK_Splice, MVT::nxv2bf16, 1 }, 2560 { TTI::SK_Splice, MVT::nxv4bf16, 1 }, 2561 { TTI::SK_Splice, MVT::nxv8bf16, 1 }, 2562 { TTI::SK_Splice, MVT::nxv2f32, 1 }, 2563 { TTI::SK_Splice, MVT::nxv4f32, 1 }, 2564 { TTI::SK_Splice, MVT::nxv2f64, 1 }, 2565 }; 2566 2567 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 2568 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext()); 2569 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2570 EVT PromotedVT = LT.second.getScalarType() == MVT::i1 2571 ? 
TLI->getPromotedVTForPredicate(EVT(LT.second)) 2572 : LT.second; 2573 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext()); 2574 InstructionCost LegalizationCost = 0; 2575 if (Index < 0) { 2576 LegalizationCost = 2577 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy, 2578 CmpInst::BAD_ICMP_PREDICATE, CostKind) + 2579 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy, 2580 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2581 } 2582 2583 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp 2584 // Cost performed on a promoted type. 2585 if (LT.second.getScalarType() == MVT::i1) { 2586 LegalizationCost += 2587 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy, 2588 TTI::CastContextHint::None, CostKind) + 2589 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy, 2590 TTI::CastContextHint::None, CostKind); 2591 } 2592 const auto *Entry = 2593 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT()); 2594 assert(Entry && "Illegal Type for Splice"); 2595 LegalizationCost += Entry->Cost; 2596 return LegalizationCost * LT.first; 2597 } 2598 2599 InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, 2600 VectorType *Tp, 2601 ArrayRef<int> Mask, int Index, 2602 VectorType *SubTp, 2603 ArrayRef<Value *> Args) { 2604 Kind = improveShuffleKindFromMask(Kind, Mask); 2605 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || 2606 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || 2607 Kind == TTI::SK_Reverse) { 2608 static const CostTblEntry ShuffleTbl[] = { 2609 // Broadcast shuffle kinds can be performed with 'dup'. 2610 { TTI::SK_Broadcast, MVT::v8i8, 1 }, 2611 { TTI::SK_Broadcast, MVT::v16i8, 1 }, 2612 { TTI::SK_Broadcast, MVT::v4i16, 1 }, 2613 { TTI::SK_Broadcast, MVT::v8i16, 1 }, 2614 { TTI::SK_Broadcast, MVT::v2i32, 1 }, 2615 { TTI::SK_Broadcast, MVT::v4i32, 1 }, 2616 { TTI::SK_Broadcast, MVT::v2i64, 1 }, 2617 { TTI::SK_Broadcast, MVT::v2f32, 1 }, 2618 { TTI::SK_Broadcast, MVT::v4f32, 1 }, 2619 { TTI::SK_Broadcast, MVT::v2f64, 1 }, 2620 // Transpose shuffle kinds can be performed with 'trn1/trn2' and 2621 // 'zip1/zip2' instructions. 2622 { TTI::SK_Transpose, MVT::v8i8, 1 }, 2623 { TTI::SK_Transpose, MVT::v16i8, 1 }, 2624 { TTI::SK_Transpose, MVT::v4i16, 1 }, 2625 { TTI::SK_Transpose, MVT::v8i16, 1 }, 2626 { TTI::SK_Transpose, MVT::v2i32, 1 }, 2627 { TTI::SK_Transpose, MVT::v4i32, 1 }, 2628 { TTI::SK_Transpose, MVT::v2i64, 1 }, 2629 { TTI::SK_Transpose, MVT::v2f32, 1 }, 2630 { TTI::SK_Transpose, MVT::v4f32, 1 }, 2631 { TTI::SK_Transpose, MVT::v2f64, 1 }, 2632 // Select shuffle kinds. 2633 // TODO: handle vXi8/vXi16. 2634 { TTI::SK_Select, MVT::v2i32, 1 }, // mov. 2635 { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar). 2636 { TTI::SK_Select, MVT::v2i64, 1 }, // mov. 2637 { TTI::SK_Select, MVT::v2f32, 1 }, // mov. 2638 { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar). 2639 { TTI::SK_Select, MVT::v2f64, 1 }, // mov. 2640 // PermuteSingleSrc shuffle kinds. 2641 { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov. 2642 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case. 2643 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov. 2644 { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov. 2645 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case. 2646 { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov. 2647 { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case. 
2648 { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case. 2649 { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case. 2650 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl 2651 { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl 2652 { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl 2653 { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl 2654 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl 2655 // Reverse can be lowered with `rev`. 2656 { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov. 2657 { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT 2658 { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov. 2659 { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov. 2660 { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT 2661 { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov. 2662 // Broadcast shuffle kinds for scalable vectors 2663 { TTI::SK_Broadcast, MVT::nxv16i8, 1 }, 2664 { TTI::SK_Broadcast, MVT::nxv8i16, 1 }, 2665 { TTI::SK_Broadcast, MVT::nxv4i32, 1 }, 2666 { TTI::SK_Broadcast, MVT::nxv2i64, 1 }, 2667 { TTI::SK_Broadcast, MVT::nxv2f16, 1 }, 2668 { TTI::SK_Broadcast, MVT::nxv4f16, 1 }, 2669 { TTI::SK_Broadcast, MVT::nxv8f16, 1 }, 2670 { TTI::SK_Broadcast, MVT::nxv2bf16, 1 }, 2671 { TTI::SK_Broadcast, MVT::nxv4bf16, 1 }, 2672 { TTI::SK_Broadcast, MVT::nxv8bf16, 1 }, 2673 { TTI::SK_Broadcast, MVT::nxv2f32, 1 }, 2674 { TTI::SK_Broadcast, MVT::nxv4f32, 1 }, 2675 { TTI::SK_Broadcast, MVT::nxv2f64, 1 }, 2676 { TTI::SK_Broadcast, MVT::nxv16i1, 1 }, 2677 { TTI::SK_Broadcast, MVT::nxv8i1, 1 }, 2678 { TTI::SK_Broadcast, MVT::nxv4i1, 1 }, 2679 { TTI::SK_Broadcast, MVT::nxv2i1, 1 }, 2680 // Handle the cases for vector.reverse with scalable vectors 2681 { TTI::SK_Reverse, MVT::nxv16i8, 1 }, 2682 { TTI::SK_Reverse, MVT::nxv8i16, 1 }, 2683 { TTI::SK_Reverse, MVT::nxv4i32, 1 }, 2684 { TTI::SK_Reverse, MVT::nxv2i64, 1 }, 2685 { TTI::SK_Reverse, MVT::nxv2f16, 1 }, 2686 { TTI::SK_Reverse, MVT::nxv4f16, 1 }, 2687 { TTI::SK_Reverse, MVT::nxv8f16, 1 }, 2688 { TTI::SK_Reverse, MVT::nxv2bf16, 1 }, 2689 { TTI::SK_Reverse, MVT::nxv4bf16, 1 }, 2690 { TTI::SK_Reverse, MVT::nxv8bf16, 1 }, 2691 { TTI::SK_Reverse, MVT::nxv2f32, 1 }, 2692 { TTI::SK_Reverse, MVT::nxv4f32, 1 }, 2693 { TTI::SK_Reverse, MVT::nxv2f64, 1 }, 2694 { TTI::SK_Reverse, MVT::nxv16i1, 1 }, 2695 { TTI::SK_Reverse, MVT::nxv8i1, 1 }, 2696 { TTI::SK_Reverse, MVT::nxv4i1, 1 }, 2697 { TTI::SK_Reverse, MVT::nxv2i1, 1 }, 2698 }; 2699 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 2700 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) 2701 return LT.first * Entry->Cost; 2702 } 2703 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp)) 2704 return getSpliceCost(Tp, Index); 2705 return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); 2706 } 2707
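// A hedged worked example for getShuffleCost above (illustrative IR, not from
// an existing test):
//   %r = shufflevector <4 x i32> %v, <4 x i32> undef,
//                      <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// is typically improved from SK_PermuteSingleSrc to TTI::SK_Reverse by
// improveShuffleKindFromMask, matches { TTI::SK_Reverse, MVT::v4i32, 2 } in
// the table (REV64 + EXT), and the result is scaled by LT.first if the
// legalized type is split.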