1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "AArch64TargetTransformInfo.h" 10 #include "AArch64ExpandImm.h" 11 #include "MCTargetDesc/AArch64AddressingModes.h" 12 #include "llvm/Analysis/IVDescriptors.h" 13 #include "llvm/Analysis/LoopInfo.h" 14 #include "llvm/Analysis/TargetTransformInfo.h" 15 #include "llvm/CodeGen/BasicTTIImpl.h" 16 #include "llvm/CodeGen/CostTable.h" 17 #include "llvm/CodeGen/TargetLowering.h" 18 #include "llvm/IR/Intrinsics.h" 19 #include "llvm/IR/IntrinsicInst.h" 20 #include "llvm/IR/IntrinsicsAArch64.h" 21 #include "llvm/IR/PatternMatch.h" 22 #include "llvm/Support/Debug.h" 23 #include "llvm/Transforms/InstCombine/InstCombiner.h" 24 #include <algorithm> 25 using namespace llvm; 26 using namespace llvm::PatternMatch; 27 28 #define DEBUG_TYPE "aarch64tti" 29 30 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", 31 cl::init(true), cl::Hidden); 32 33 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10), 34 cl::Hidden); 35 36 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", 37 cl::init(10), cl::Hidden); 38 39 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, 40 const Function *Callee) const { 41 const TargetMachine &TM = getTLI()->getTargetMachine(); 42 43 const FeatureBitset &CallerBits = 44 TM.getSubtargetImpl(*Caller)->getFeatureBits(); 45 const FeatureBitset &CalleeBits = 46 TM.getSubtargetImpl(*Callee)->getFeatureBits(); 47 48 // Inline a callee if its target-features are a subset of the caller's 49 // target-features. 50 return (CallerBits & CalleeBits) == CalleeBits; 51 } 52 53 /// Calculate the cost of materializing a 64-bit value. This helper 54 /// method might only calculate a fraction of a larger immediate. Therefore it 55 /// is valid to return a cost of ZERO. 56 InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) { 57 // Check if the immediate can be encoded within an instruction. 58 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) 59 return 0; 60 61 if (Val < 0) 62 Val = ~Val; 63 64 // Calculate how many moves we will need to materialize this constant. 65 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 66 AArch64_IMM::expandMOVImm(Val, 64, Insn); 67 return Insn.size(); 68 } 69 70 /// Calculate the cost of materializing the given constant. 71 InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 72 TTI::TargetCostKind CostKind) { 73 assert(Ty->isIntegerTy()); 74 75 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 76 if (BitSize == 0) 77 return ~0U; 78 79 // Sign-extend all constants to a multiple of 64-bit. 80 APInt ImmVal = Imm; 81 if (BitSize & 0x3f) 82 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); 83 84 // Split the constant into 64-bit chunks and calculate the cost for each 85 // chunk. 86 InstructionCost Cost = 0; 87 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { 88 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); 89 int64_t Val = Tmp.getSExtValue(); 90 Cost += getIntImmCost(Val); 91 } 92 // We need at least one instruction to materialize the constant.
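// For example, a 128-bit constant whose two 64-bit chunks are both encodable
// logical immediates accumulates a Cost of 0 above, yet still needs one
// instruction, so clamp to a minimum of 1.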
93 return std::max<InstructionCost>(1, Cost); 94 } 95 96 InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 97 const APInt &Imm, Type *Ty, 98 TTI::TargetCostKind CostKind, 99 Instruction *Inst) { 100 assert(Ty->isIntegerTy()); 101 102 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 103 // There is no cost model for constants with a bit size of 0. Return TCC_Free 104 // here, so that constant hoisting will ignore this constant. 105 if (BitSize == 0) 106 return TTI::TCC_Free; 107 108 unsigned ImmIdx = ~0U; 109 switch (Opcode) { 110 default: 111 return TTI::TCC_Free; 112 case Instruction::GetElementPtr: 113 // Always hoist the base address of a GetElementPtr. 114 if (Idx == 0) 115 return 2 * TTI::TCC_Basic; 116 return TTI::TCC_Free; 117 case Instruction::Store: 118 ImmIdx = 0; 119 break; 120 case Instruction::Add: 121 case Instruction::Sub: 122 case Instruction::Mul: 123 case Instruction::UDiv: 124 case Instruction::SDiv: 125 case Instruction::URem: 126 case Instruction::SRem: 127 case Instruction::And: 128 case Instruction::Or: 129 case Instruction::Xor: 130 case Instruction::ICmp: 131 ImmIdx = 1; 132 break; 133 // Always return TCC_Free for the shift value of a shift instruction. 134 case Instruction::Shl: 135 case Instruction::LShr: 136 case Instruction::AShr: 137 if (Idx == 1) 138 return TTI::TCC_Free; 139 break; 140 case Instruction::Trunc: 141 case Instruction::ZExt: 142 case Instruction::SExt: 143 case Instruction::IntToPtr: 144 case Instruction::PtrToInt: 145 case Instruction::BitCast: 146 case Instruction::PHI: 147 case Instruction::Call: 148 case Instruction::Select: 149 case Instruction::Ret: 150 case Instruction::Load: 151 break; 152 } 153 154 if (Idx == ImmIdx) { 155 int NumConstants = (BitSize + 63) / 64; 156 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 157 return (Cost <= NumConstants * TTI::TCC_Basic) 158 ? static_cast<int>(TTI::TCC_Free) 159 : Cost; 160 } 161 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 162 } 163 164 InstructionCost 165 AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, 166 const APInt &Imm, Type *Ty, 167 TTI::TargetCostKind CostKind) { 168 assert(Ty->isIntegerTy()); 169 170 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 171 // There is no cost model for constants with a bit size of 0. Return TCC_Free 172 // here, so that constant hoisting will ignore this constant. 173 if (BitSize == 0) 174 return TTI::TCC_Free; 175 176 // Most (all?) AArch64 intrinsics do not support folding immediates into the 177 // selected instruction, so we compute the materialization cost for the 178 // immediate directly. 179 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) 180 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 181 182 switch (IID) { 183 default: 184 return TTI::TCC_Free; 185 case Intrinsic::sadd_with_overflow: 186 case Intrinsic::uadd_with_overflow: 187 case Intrinsic::ssub_with_overflow: 188 case Intrinsic::usub_with_overflow: 189 case Intrinsic::smul_with_overflow: 190 case Intrinsic::umul_with_overflow: 191 if (Idx == 1) { 192 int NumConstants = (BitSize + 63) / 64; 193 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 194 return (Cost <= NumConstants * TTI::TCC_Basic) 195 ? 
static_cast<int>(TTI::TCC_Free) 196 : Cost; 197 } 198 break; 199 case Intrinsic::experimental_stackmap: 200 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 201 return TTI::TCC_Free; 202 break; 203 case Intrinsic::experimental_patchpoint_void: 204 case Intrinsic::experimental_patchpoint_i64: 205 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 206 return TTI::TCC_Free; 207 break; 208 case Intrinsic::experimental_gc_statepoint: 209 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 210 return TTI::TCC_Free; 211 break; 212 } 213 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 214 } 215 216 TargetTransformInfo::PopcntSupportKind 217 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { 218 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 219 if (TyWidth == 32 || TyWidth == 64) 220 return TTI::PSK_FastHardware; 221 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. 222 return TTI::PSK_Software; 223 } 224 225 InstructionCost 226 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 227 TTI::TargetCostKind CostKind) { 228 auto *RetTy = ICA.getReturnType(); 229 switch (ICA.getID()) { 230 case Intrinsic::umin: 231 case Intrinsic::umax: 232 case Intrinsic::smin: 233 case Intrinsic::smax: { 234 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, 235 MVT::v8i16, MVT::v2i32, MVT::v4i32}; 236 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 237 // v2i64 types get converted to cmp+bif hence the cost of 2 238 if (LT.second == MVT::v2i64) 239 return LT.first * 2; 240 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; })) 241 return LT.first; 242 break; 243 } 244 case Intrinsic::sadd_sat: 245 case Intrinsic::ssub_sat: 246 case Intrinsic::uadd_sat: 247 case Intrinsic::usub_sat: { 248 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, 249 MVT::v8i16, MVT::v2i32, MVT::v4i32, 250 MVT::v2i64}; 251 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 252 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we 253 // need to extend the type, as it uses shr(qadd(shl, shl)). 254 unsigned Instrs = 255 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4; 256 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; })) 257 return LT.first * Instrs; 258 break; 259 } 260 case Intrinsic::abs: { 261 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, 262 MVT::v8i16, MVT::v2i32, MVT::v4i32, 263 MVT::v2i64}; 264 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 265 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; })) 266 return LT.first; 267 break; 268 } 269 case Intrinsic::experimental_stepvector: { 270 InstructionCost Cost = 1; // Cost of the `index' instruction 271 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 272 // Legalisation of illegal vectors involves an `index' instruction plus 273 // (LT.first - 1) vector adds.
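// For example, a step vector that legalises into two registers needs the
// single `index' plus one vector add to produce the second part.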
274 if (LT.first > 1) { 275 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext()); 276 InstructionCost AddCost = 277 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind); 278 Cost += AddCost * (LT.first - 1); 279 } 280 return Cost; 281 } 282 case Intrinsic::bitreverse: { 283 static const CostTblEntry BitreverseTbl[] = { 284 {Intrinsic::bitreverse, MVT::i32, 1}, 285 {Intrinsic::bitreverse, MVT::i64, 1}, 286 {Intrinsic::bitreverse, MVT::v8i8, 1}, 287 {Intrinsic::bitreverse, MVT::v16i8, 1}, 288 {Intrinsic::bitreverse, MVT::v4i16, 2}, 289 {Intrinsic::bitreverse, MVT::v8i16, 2}, 290 {Intrinsic::bitreverse, MVT::v2i32, 2}, 291 {Intrinsic::bitreverse, MVT::v4i32, 2}, 292 {Intrinsic::bitreverse, MVT::v1i64, 2}, 293 {Intrinsic::bitreverse, MVT::v2i64, 2}, 294 }; 295 const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy); 296 const auto *Entry = 297 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second); 298 if (Entry) { 299 // Cost Model is using the legal type(i32) that i8 and i16 will be 300 // converted to +1 so that we match the actual lowering cost 301 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 || 302 TLI->getValueType(DL, RetTy, true) == MVT::i16) 303 return LegalisationCost.first * Entry->Cost + 1; 304 305 return LegalisationCost.first * Entry->Cost; 306 } 307 break; 308 } 309 case Intrinsic::ctpop: { 310 static const CostTblEntry CtpopCostTbl[] = { 311 {ISD::CTPOP, MVT::v2i64, 4}, 312 {ISD::CTPOP, MVT::v4i32, 3}, 313 {ISD::CTPOP, MVT::v8i16, 2}, 314 {ISD::CTPOP, MVT::v16i8, 1}, 315 {ISD::CTPOP, MVT::i64, 4}, 316 {ISD::CTPOP, MVT::v2i32, 3}, 317 {ISD::CTPOP, MVT::v4i16, 2}, 318 {ISD::CTPOP, MVT::v8i8, 1}, 319 {ISD::CTPOP, MVT::i32, 5}, 320 }; 321 auto LT = TLI->getTypeLegalizationCost(DL, RetTy); 322 MVT MTy = LT.second; 323 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) { 324 // Extra cost of +1 when illegal vector types are legalized by promoting 325 // the integer type. 326 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() != 327 RetTy->getScalarSizeInBits() 328 ? 
1 329 : 0; 330 return LT.first * Entry->Cost + ExtraCost; 331 } 332 break; 333 } 334 case Intrinsic::sadd_with_overflow: 335 case Intrinsic::uadd_with_overflow: 336 case Intrinsic::ssub_with_overflow: 337 case Intrinsic::usub_with_overflow: 338 case Intrinsic::smul_with_overflow: 339 case Intrinsic::umul_with_overflow: { 340 static const CostTblEntry WithOverflowCostTbl[] = { 341 {Intrinsic::sadd_with_overflow, MVT::i8, 3}, 342 {Intrinsic::uadd_with_overflow, MVT::i8, 3}, 343 {Intrinsic::sadd_with_overflow, MVT::i16, 3}, 344 {Intrinsic::uadd_with_overflow, MVT::i16, 3}, 345 {Intrinsic::sadd_with_overflow, MVT::i32, 1}, 346 {Intrinsic::uadd_with_overflow, MVT::i32, 1}, 347 {Intrinsic::sadd_with_overflow, MVT::i64, 1}, 348 {Intrinsic::uadd_with_overflow, MVT::i64, 1}, 349 {Intrinsic::ssub_with_overflow, MVT::i8, 3}, 350 {Intrinsic::usub_with_overflow, MVT::i8, 3}, 351 {Intrinsic::ssub_with_overflow, MVT::i16, 3}, 352 {Intrinsic::usub_with_overflow, MVT::i16, 3}, 353 {Intrinsic::ssub_with_overflow, MVT::i32, 1}, 354 {Intrinsic::usub_with_overflow, MVT::i32, 1}, 355 {Intrinsic::ssub_with_overflow, MVT::i64, 1}, 356 {Intrinsic::usub_with_overflow, MVT::i64, 1}, 357 {Intrinsic::smul_with_overflow, MVT::i8, 5}, 358 {Intrinsic::umul_with_overflow, MVT::i8, 4}, 359 {Intrinsic::smul_with_overflow, MVT::i16, 5}, 360 {Intrinsic::umul_with_overflow, MVT::i16, 4}, 361 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst 362 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw 363 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp 364 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr 365 }; 366 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true); 367 if (MTy.isSimple()) 368 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(), 369 MTy.getSimpleVT())) 370 return Entry->Cost; 371 break; 372 } 373 default: 374 break; 375 } 376 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 377 } 378 379 /// The function will remove redundant reinterprets casting in the presence 380 /// of the control flow 381 static Optional<Instruction *> processPhiNode(InstCombiner &IC, 382 IntrinsicInst &II) { 383 SmallVector<Instruction *, 32> Worklist; 384 auto RequiredType = II.getType(); 385 386 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0)); 387 assert(PN && "Expected Phi Node!"); 388 389 // Don't create a new Phi unless we can remove the old one. 
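// (That single use is the reinterpret intrinsic being combined here.)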
390 if (!PN->hasOneUse()) 391 return None; 392 393 for (Value *IncValPhi : PN->incoming_values()) { 394 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi); 395 if (!Reinterpret || 396 Reinterpret->getIntrinsicID() != 397 Intrinsic::aarch64_sve_convert_to_svbool || 398 RequiredType != Reinterpret->getArgOperand(0)->getType()) 399 return None; 400 } 401 402 // Create the new Phi 403 LLVMContext &Ctx = PN->getContext(); 404 IRBuilder<> Builder(Ctx); 405 Builder.SetInsertPoint(PN); 406 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); 407 Worklist.push_back(PN); 408 409 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { 410 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); 411 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); 412 Worklist.push_back(Reinterpret); 413 } 414 415 // Cleanup Phi Node and reinterprets 416 return IC.replaceInstUsesWith(II, NPN); 417 } 418 419 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))) 420 // => (binop (pred) (from_svbool _) (from_svbool _)) 421 // 422 // The above transformation eliminates a `to_svbool` in the predicate 423 // operand of bitwise operation `binop` by narrowing the vector width of 424 // the operation. For example, it would convert a `<vscale x 16 x i1> 425 // and` into a `<vscale x 4 x i1> and`. This is profitable because 426 // to_svbool must zero the new lanes during widening, whereas 427 // from_svbool is free. 428 static Optional<Instruction *> tryCombineFromSVBoolBinOp(InstCombiner &IC, 429 IntrinsicInst &II) { 430 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0)); 431 if (!BinOp) 432 return None; 433 434 auto IntrinsicID = BinOp->getIntrinsicID(); 435 switch (IntrinsicID) { 436 case Intrinsic::aarch64_sve_and_z: 437 case Intrinsic::aarch64_sve_bic_z: 438 case Intrinsic::aarch64_sve_eor_z: 439 case Intrinsic::aarch64_sve_nand_z: 440 case Intrinsic::aarch64_sve_nor_z: 441 case Intrinsic::aarch64_sve_orn_z: 442 case Intrinsic::aarch64_sve_orr_z: 443 break; 444 default: 445 return None; 446 } 447 448 auto BinOpPred = BinOp->getOperand(0); 449 auto BinOpOp1 = BinOp->getOperand(1); 450 auto BinOpOp2 = BinOp->getOperand(2); 451 452 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred); 453 if (!PredIntr || 454 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) 455 return None; 456 457 auto PredOp = PredIntr->getOperand(0); 458 auto PredOpTy = cast<VectorType>(PredOp->getType()); 459 if (PredOpTy != II.getType()) 460 return None; 461 462 IRBuilder<> Builder(II.getContext()); 463 Builder.SetInsertPoint(&II); 464 465 SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; 466 auto NarrowBinOpOp1 = Builder.CreateIntrinsic( 467 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1}); 468 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 469 if (BinOpOp1 == BinOpOp2) 470 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 471 else 472 NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic( 473 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2})); 474 475 auto NarrowedBinOp = 476 Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs); 477 return IC.replaceInstUsesWith(II, NarrowedBinOp); 478 } 479 480 static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC, 481 IntrinsicInst &II) { 482 // If the reinterpret instruction operand is a PHI Node 483 if (isa<PHINode>(II.getArgOperand(0))) 484 return processPhiNode(IC, II); 485 486 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) 487 return 
BinOpCombine; 488 489 SmallVector<Instruction *, 32> CandidatesForRemoval; 490 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; 491 492 const auto *IVTy = cast<VectorType>(II.getType()); 493 494 // Walk the chain of conversions. 495 while (Cursor) { 496 // If the type of the cursor has fewer lanes than the final result, zeroing 497 // must take place, which breaks the equivalence chain. 498 const auto *CursorVTy = cast<VectorType>(Cursor->getType()); 499 if (CursorVTy->getElementCount().getKnownMinValue() < 500 IVTy->getElementCount().getKnownMinValue()) 501 break; 502 503 // If the cursor has the same type as I, it is a viable replacement. 504 if (Cursor->getType() == IVTy) 505 EarliestReplacement = Cursor; 506 507 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); 508 509 // If this is not an SVE conversion intrinsic, this is the end of the chain. 510 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == 511 Intrinsic::aarch64_sve_convert_to_svbool || 512 IntrinsicCursor->getIntrinsicID() == 513 Intrinsic::aarch64_sve_convert_from_svbool)) 514 break; 515 516 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); 517 Cursor = IntrinsicCursor->getOperand(0); 518 } 519 520 // If no viable replacement in the conversion chain was found, there is 521 // nothing to do. 522 if (!EarliestReplacement) 523 return None; 524 525 return IC.replaceInstUsesWith(II, EarliestReplacement); 526 } 527 528 static Optional<Instruction *> instCombineSVESel(InstCombiner &IC, 529 IntrinsicInst &II) { 530 IRBuilder<> Builder(&II); 531 auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1), 532 II.getOperand(2)); 533 return IC.replaceInstUsesWith(II, Select); 534 } 535 536 static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC, 537 IntrinsicInst &II) { 538 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 539 if (!Pg) 540 return None; 541 542 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 543 return None; 544 545 const auto PTruePattern = 546 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 547 if (PTruePattern != AArch64SVEPredPattern::vl1) 548 return None; 549 550 // The intrinsic is inserting into lane zero so use an insert instead. 551 auto *IdxTy = Type::getInt64Ty(II.getContext()); 552 auto *Insert = InsertElementInst::Create( 553 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0)); 554 Insert->insertBefore(&II); 555 Insert->takeName(&II); 556 557 return IC.replaceInstUsesWith(II, Insert); 558 } 559 560 static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, 561 IntrinsicInst &II) { 562 // Replace DupX with a regular IR splat. 
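// For example, aarch64.sve.dup.x(i32 %x) becomes a splat of %x over the
// intrinsic's scalable result type, which later optimisations can reason
// about directly.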
563 IRBuilder<> Builder(II.getContext()); 564 Builder.SetInsertPoint(&II); 565 auto *RetTy = cast<ScalableVectorType>(II.getType()); 566 Value *Splat = 567 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0)); 568 Splat->takeName(&II); 569 return IC.replaceInstUsesWith(II, Splat); 570 } 571 572 static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, 573 IntrinsicInst &II) { 574 LLVMContext &Ctx = II.getContext(); 575 IRBuilder<> Builder(Ctx); 576 Builder.SetInsertPoint(&II); 577 578 // Check that the predicate is all active 579 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 580 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 581 return None; 582 583 const auto PTruePattern = 584 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 585 if (PTruePattern != AArch64SVEPredPattern::all) 586 return None; 587 588 // Check that we have a compare of zero.. 589 auto *SplatValue = 590 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2))); 591 if (!SplatValue || !SplatValue->isZero()) 592 return None; 593 594 // ..against a dupq 595 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 596 if (!DupQLane || 597 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) 598 return None; 599 600 // Where the dupq is a lane 0 replicate of a vector insert 601 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero()) 602 return None; 603 604 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); 605 if (!VecIns || 606 VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert) 607 return None; 608 609 // Where the vector insert is a fixed constant vector insert into undef at 610 // index zero 611 if (!isa<UndefValue>(VecIns->getArgOperand(0))) 612 return None; 613 614 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero()) 615 return None; 616 617 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1)); 618 if (!ConstVec) 619 return None; 620 621 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType()); 622 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType()); 623 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) 624 return None; 625 626 unsigned NumElts = VecTy->getNumElements(); 627 unsigned PredicateBits = 0; 628 629 // Expand intrinsic operands to a 16-bit byte level predicate 630 for (unsigned I = 0; I < NumElts; ++I) { 631 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I)); 632 if (!Arg) 633 return None; 634 if (!Arg->isZero()) 635 PredicateBits |= 1 << (I * (16 / NumElts)); 636 } 637 638 // If all bits are zero bail early with an empty predicate 639 if (PredicateBits == 0) { 640 auto *PFalse = Constant::getNullValue(II.getType()); 641 PFalse->takeName(&II); 642 return IC.replaceInstUsesWith(II, PFalse); 643 } 644 645 // Calculate largest predicate type used (where byte predicate is largest) 646 unsigned Mask = 8; 647 for (unsigned I = 0; I < 16; ++I) 648 if ((PredicateBits & (1 << I)) != 0) 649 Mask |= (I % 8); 650 651 unsigned PredSize = Mask & -Mask; 652 auto *PredType = ScalableVectorType::get( 653 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8)); 654 655 // Ensure all relevant bits are set 656 for (unsigned I = 0; I < 16; I += PredSize) 657 if ((PredicateBits & (1 << I)) == 0) 658 return None; 659 660 auto *PTruePat = 661 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 662 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 663 {PredType}, {PTruePat}); 664 auto 
*ConvertToSVBool = Builder.CreateIntrinsic( 665 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); 666 auto *ConvertFromSVBool = 667 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, 668 {II.getType()}, {ConvertToSVBool}); 669 670 ConvertFromSVBool->takeName(&II); 671 return IC.replaceInstUsesWith(II, ConvertFromSVBool); 672 } 673 674 static Optional<Instruction *> instCombineSVELast(InstCombiner &IC, 675 IntrinsicInst &II) { 676 IRBuilder<> Builder(II.getContext()); 677 Builder.SetInsertPoint(&II); 678 Value *Pg = II.getArgOperand(0); 679 Value *Vec = II.getArgOperand(1); 680 auto IntrinsicID = II.getIntrinsicID(); 681 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; 682 683 // lastX(splat(X)) --> X 684 if (auto *SplatVal = getSplatValue(Vec)) 685 return IC.replaceInstUsesWith(II, SplatVal); 686 687 // If x and/or y is a splat value then: 688 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) 689 Value *LHS, *RHS; 690 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) { 691 if (isSplatValue(LHS) || isSplatValue(RHS)) { 692 auto *OldBinOp = cast<BinaryOperator>(Vec); 693 auto OpC = OldBinOp->getOpcode(); 694 auto *NewLHS = 695 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS}); 696 auto *NewRHS = 697 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS}); 698 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( 699 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II); 700 return IC.replaceInstUsesWith(II, NewBinOp); 701 } 702 } 703 704 auto *C = dyn_cast<Constant>(Pg); 705 if (IsAfter && C && C->isNullValue()) { 706 // The intrinsic is extracting lane 0 so use an extract instead. 707 auto *IdxTy = Type::getInt64Ty(II.getContext()); 708 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0)); 709 Extract->insertBefore(&II); 710 Extract->takeName(&II); 711 return IC.replaceInstUsesWith(II, Extract); 712 } 713 714 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg); 715 if (!IntrPG) 716 return None; 717 718 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 719 return None; 720 721 const auto PTruePattern = 722 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue(); 723 724 // Can the intrinsic's predicate be converted to a known constant index? 725 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern); 726 if (!MinNumElts) 727 return None; 728 729 unsigned Idx = MinNumElts - 1; 730 // Increment the index if extracting the element after the last active 731 // predicate element. 732 if (IsAfter) 733 ++Idx; 734 735 // Ignore extracts whose index is larger than the known minimum vector 736 // length. NOTE: This is an artificial constraint where we prefer to 737 // maintain what the user asked for until an alternative is proven faster. 738 auto *PgVTy = cast<ScalableVectorType>(Pg->getType()); 739 if (Idx >= PgVTy->getMinNumElements()) 740 return None; 741 742 // The intrinsic is extracting a fixed lane so use an extract instead. 
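// For example, with a ptrue(vl8) predicate, lastb extracts lane 7 and lasta
// extracts lane 8, matching the Idx computed above.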
743 auto *IdxTy = Type::getInt64Ty(II.getContext()); 744 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx)); 745 Extract->insertBefore(&II); 746 Extract->takeName(&II); 747 return IC.replaceInstUsesWith(II, Extract); 748 } 749 750 static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC, 751 IntrinsicInst &II) { 752 LLVMContext &Ctx = II.getContext(); 753 IRBuilder<> Builder(Ctx); 754 Builder.SetInsertPoint(&II); 755 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr 756 // can work with RDFFR_PP for ptest elimination. 757 auto *AllPat = 758 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 759 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 760 {II.getType()}, {AllPat}); 761 auto *RDFFR = 762 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); 763 RDFFR->takeName(&II); 764 return IC.replaceInstUsesWith(II, RDFFR); 765 } 766 767 static Optional<Instruction *> 768 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { 769 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); 770 771 if (Pattern == AArch64SVEPredPattern::all) { 772 LLVMContext &Ctx = II.getContext(); 773 IRBuilder<> Builder(Ctx); 774 Builder.SetInsertPoint(&II); 775 776 Constant *StepVal = ConstantInt::get(II.getType(), NumElts); 777 auto *VScale = Builder.CreateVScale(StepVal); 778 VScale->takeName(&II); 779 return IC.replaceInstUsesWith(II, VScale); 780 } 781 782 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); 783 784 return MinNumElts && NumElts >= MinNumElts 785 ? Optional<Instruction *>(IC.replaceInstUsesWith( 786 II, ConstantInt::get(II.getType(), MinNumElts))) 787 : None; 788 } 789 790 static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, 791 IntrinsicInst &II) { 792 IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 793 IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 794 795 if (Op1 && Op2 && 796 Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 797 Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 798 Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) { 799 800 IRBuilder<> Builder(II.getContext()); 801 Builder.SetInsertPoint(&II); 802 803 Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)}; 804 Type *Tys[] = {Op1->getArgOperand(0)->getType()}; 805 806 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); 807 808 PTest->takeName(&II); 809 return IC.replaceInstUsesWith(II, PTest); 810 } 811 812 return None; 813 } 814 815 static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC, 816 IntrinsicInst &II) { 817 // fold (fadd p a (fmul p b c)) -> (fma p a b c) 818 Value *P = II.getOperand(0); 819 Value *A = II.getOperand(1); 820 auto FMul = II.getOperand(2); 821 Value *B, *C; 822 if (!match(FMul, m_Intrinsic<Intrinsic::aarch64_sve_fmul>( 823 m_Specific(P), m_Value(B), m_Value(C)))) 824 return None; 825 826 if (!FMul->hasOneUse()) 827 return None; 828 829 llvm::FastMathFlags FAddFlags = II.getFastMathFlags(); 830 // Stop the combine when the flags on the inputs differ in case dropping flags 831 // would lead to us missing out on more beneficial optimizations. 
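// The contract flag is also required since the fold introduces an fma-style
// contraction of the separate multiply and add.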
832 if (FAddFlags != cast<CallInst>(FMul)->getFastMathFlags()) 833 return None; 834 if (!FAddFlags.allowContract()) 835 return None; 836 837 IRBuilder<> Builder(II.getContext()); 838 Builder.SetInsertPoint(&II); 839 auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla, 840 {II.getType()}, {P, A, B, C}, &II); 841 FMLA->setFastMathFlags(FAddFlags); 842 return IC.replaceInstUsesWith(II, FMLA); 843 } 844 845 static bool isAllActivePredicate(Value *Pred) { 846 // Look through convert.from.svbool(convert.to.svbool(...) chain. 847 Value *UncastedPred; 848 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( 849 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( 850 m_Value(UncastedPred))))) 851 // If the predicate has the same or less lanes than the uncasted 852 // predicate then we know the casting has no effect. 853 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <= 854 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements()) 855 Pred = UncastedPred; 856 857 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 858 m_ConstantInt<AArch64SVEPredPattern::all>())); 859 } 860 861 static Optional<Instruction *> 862 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 863 IRBuilder<> Builder(II.getContext()); 864 Builder.SetInsertPoint(&II); 865 866 Value *Pred = II.getOperand(0); 867 Value *PtrOp = II.getOperand(1); 868 Type *VecTy = II.getType(); 869 Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo()); 870 871 if (isAllActivePredicate(Pred)) { 872 LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr); 873 Load->copyMetadata(II); 874 return IC.replaceInstUsesWith(II, Load); 875 } 876 877 CallInst *MaskedLoad = 878 Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL), 879 Pred, ConstantAggregateZero::get(VecTy)); 880 MaskedLoad->copyMetadata(II); 881 return IC.replaceInstUsesWith(II, MaskedLoad); 882 } 883 884 static Optional<Instruction *> 885 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 886 IRBuilder<> Builder(II.getContext()); 887 Builder.SetInsertPoint(&II); 888 889 Value *VecOp = II.getOperand(0); 890 Value *Pred = II.getOperand(1); 891 Value *PtrOp = II.getOperand(2); 892 Value *VecPtr = 893 Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo()); 894 895 if (isAllActivePredicate(Pred)) { 896 StoreInst *Store = Builder.CreateStore(VecOp, VecPtr); 897 Store->copyMetadata(II); 898 return IC.eraseInstFromFunction(II); 899 } 900 901 CallInst *MaskedStore = Builder.CreateMaskedStore( 902 VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred); 903 MaskedStore->copyMetadata(II); 904 return IC.eraseInstFromFunction(II); 905 } 906 907 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { 908 switch (Intrinsic) { 909 case Intrinsic::aarch64_sve_fmul: 910 return Instruction::BinaryOps::FMul; 911 case Intrinsic::aarch64_sve_fadd: 912 return Instruction::BinaryOps::FAdd; 913 case Intrinsic::aarch64_sve_fsub: 914 return Instruction::BinaryOps::FSub; 915 default: 916 return Instruction::BinaryOpsEnd; 917 } 918 } 919 920 static Optional<Instruction *> instCombineSVEVectorBinOp(InstCombiner &IC, 921 IntrinsicInst &II) { 922 auto *OpPredicate = II.getOperand(0); 923 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID()); 924 if (BinOpCode == Instruction::BinaryOpsEnd || 925 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 926 m_ConstantInt<AArch64SVEPredPattern::all>()))) 927 return None; 928 
IRBuilder<> Builder(II.getContext()); 929 Builder.SetInsertPoint(&II); 930 Builder.setFastMathFlags(II.getFastMathFlags()); 931 auto BinOp = 932 Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2)); 933 return IC.replaceInstUsesWith(II, BinOp); 934 } 935 936 static Optional<Instruction *> instCombineSVEVectorFAdd(InstCombiner &IC, 937 IntrinsicInst &II) { 938 if (auto FMLA = instCombineSVEVectorFMLA(IC, II)) 939 return FMLA; 940 return instCombineSVEVectorBinOp(IC, II); 941 } 942 943 static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC, 944 IntrinsicInst &II) { 945 auto *OpPredicate = II.getOperand(0); 946 auto *OpMultiplicand = II.getOperand(1); 947 auto *OpMultiplier = II.getOperand(2); 948 949 IRBuilder<> Builder(II.getContext()); 950 Builder.SetInsertPoint(&II); 951 952 // Return true if a given instruction is a unit splat value, false otherwise. 953 auto IsUnitSplat = [](auto *I) { 954 auto *SplatValue = getSplatValue(I); 955 if (!SplatValue) 956 return false; 957 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 958 }; 959 960 // Return true if a given instruction is an aarch64_sve_dup intrinsic call 961 // with a unit splat value, false otherwise. 962 auto IsUnitDup = [](auto *I) { 963 auto *IntrI = dyn_cast<IntrinsicInst>(I); 964 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) 965 return false; 966 967 auto *SplatValue = IntrI->getOperand(2); 968 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 969 }; 970 971 if (IsUnitSplat(OpMultiplier)) { 972 // [f]mul pg %n, (dupx 1) => %n 973 OpMultiplicand->takeName(&II); 974 return IC.replaceInstUsesWith(II, OpMultiplicand); 975 } else if (IsUnitDup(OpMultiplier)) { 976 // [f]mul pg %n, (dup pg 1) => %n 977 auto *DupInst = cast<IntrinsicInst>(OpMultiplier); 978 auto *DupPg = DupInst->getOperand(1); 979 // TODO: this is naive. The optimization is still valid if DupPg 980 // 'encompasses' OpPredicate, not only if they're the same predicate. 981 if (OpPredicate == DupPg) { 982 OpMultiplicand->takeName(&II); 983 return IC.replaceInstUsesWith(II, OpMultiplicand); 984 } 985 } 986 987 return instCombineSVEVectorBinOp(IC, II); 988 } 989 990 static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, 991 IntrinsicInst &II) { 992 IRBuilder<> Builder(II.getContext()); 993 Builder.SetInsertPoint(&II); 994 Value *UnpackArg = II.getArgOperand(0); 995 auto *RetTy = cast<ScalableVectorType>(II.getType()); 996 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || 997 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; 998 999 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) 1000 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) 1001 if (auto *ScalarArg = getSplatValue(UnpackArg)) { 1002 ScalarArg = 1003 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned); 1004 Value *NewVal = 1005 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg); 1006 NewVal->takeName(&II); 1007 return IC.replaceInstUsesWith(II, NewVal); 1008 } 1009 1010 return None; 1011 } 1012 static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC, 1013 IntrinsicInst &II) { 1014 auto *OpVal = II.getOperand(0); 1015 auto *OpIndices = II.getOperand(1); 1016 VectorType *VTy = cast<VectorType>(II.getType()); 1017 1018 // Check whether OpIndices is a constant splat value < minimal element count 1019 // of result. 
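// For example, for a <vscale x 4 x i32> result the splatted index must be in
// the range 0..3 so that the selected lane exists for every vector length.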
1020 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices)); 1021 if (!SplatValue || 1022 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue())) 1023 return None; 1024 1025 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to 1026 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. 1027 IRBuilder<> Builder(II.getContext()); 1028 Builder.SetInsertPoint(&II); 1029 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue); 1030 auto *VectorSplat = 1031 Builder.CreateVectorSplat(VTy->getElementCount(), Extract); 1032 1033 VectorSplat->takeName(&II); 1034 return IC.replaceInstUsesWith(II, VectorSplat); 1035 } 1036 1037 static Optional<Instruction *> instCombineSVETupleGet(InstCombiner &IC, 1038 IntrinsicInst &II) { 1039 // Try to remove sequences of tuple get/set. 1040 Value *SetTuple, *SetIndex, *SetValue; 1041 auto *GetTuple = II.getArgOperand(0); 1042 auto *GetIndex = II.getArgOperand(1); 1043 // Check that we have tuple_get(GetTuple, GetIndex) where GetTuple is a 1044 // call to tuple_set i.e. tuple_set(SetTuple, SetIndex, SetValue). 1045 // Make sure that the types of the current intrinsic and SetValue match 1046 // in order to safely remove the sequence. 1047 if (!match(GetTuple, 1048 m_Intrinsic<Intrinsic::aarch64_sve_tuple_set>( 1049 m_Value(SetTuple), m_Value(SetIndex), m_Value(SetValue))) || 1050 SetValue->getType() != II.getType()) 1051 return None; 1052 // Case where we get the same index right after setting it. 1053 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) --> SetValue 1054 if (GetIndex == SetIndex) 1055 return IC.replaceInstUsesWith(II, SetValue); 1056 // If we are getting a different index than what was set in the tuple_set 1057 // intrinsic. We can just set the input tuple to the one up in the chain. 1058 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) 1059 // --> tuple_get(SetTuple, GetIndex) 1060 return IC.replaceOperand(II, 0, SetTuple); 1061 } 1062 1063 static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC, 1064 IntrinsicInst &II) { 1065 // zip1(uzp1(A, B), uzp2(A, B)) --> A 1066 // zip2(uzp1(A, B), uzp2(A, B)) --> B 1067 Value *A, *B; 1068 if (match(II.getArgOperand(0), 1069 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) && 1070 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( 1071 m_Specific(A), m_Specific(B)))) 1072 return IC.replaceInstUsesWith( 1073 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); 1074 1075 return None; 1076 } 1077 1078 static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC, 1079 IntrinsicInst &II) { 1080 Value *Mask = II.getOperand(0); 1081 Value *BasePtr = II.getOperand(1); 1082 Value *Index = II.getOperand(2); 1083 Type *Ty = II.getType(); 1084 Value *PassThru = ConstantAggregateZero::get(Ty); 1085 1086 // Contiguous gather => masked load. 
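// A gather whose indices are IndexBase, IndexBase+1, ... is a unit-stride
// access starting at BasePtr + IndexBase: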
1087 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) 1088 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) 1089 Value *IndexBase; 1090 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1091 m_Value(IndexBase), m_SpecificInt(1)))) { 1092 IRBuilder<> Builder(II.getContext()); 1093 Builder.SetInsertPoint(&II); 1094 1095 Align Alignment = 1096 BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); 1097 1098 Type *VecPtrTy = PointerType::getUnqual(Ty); 1099 Value *Ptr = Builder.CreateGEP( 1100 cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase); 1101 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); 1102 CallInst *MaskedLoad = 1103 Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); 1104 MaskedLoad->takeName(&II); 1105 return IC.replaceInstUsesWith(II, MaskedLoad); 1106 } 1107 1108 return None; 1109 } 1110 1111 static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC, 1112 IntrinsicInst &II) { 1113 Value *Val = II.getOperand(0); 1114 Value *Mask = II.getOperand(1); 1115 Value *BasePtr = II.getOperand(2); 1116 Value *Index = II.getOperand(3); 1117 Type *Ty = Val->getType(); 1118 1119 // Contiguous scatter => masked store. 1120 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) 1121 // => (masked.store Value (gep BasePtr IndexBase) Align Mask) 1122 Value *IndexBase; 1123 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1124 m_Value(IndexBase), m_SpecificInt(1)))) { 1125 IRBuilder<> Builder(II.getContext()); 1126 Builder.SetInsertPoint(&II); 1127 1128 Align Alignment = 1129 BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); 1130 1131 Value *Ptr = Builder.CreateGEP( 1132 cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase); 1133 Type *VecPtrTy = PointerType::getUnqual(Ty); 1134 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); 1135 1136 (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); 1137 1138 return IC.eraseInstFromFunction(II); 1139 } 1140 1141 return None; 1142 } 1143 1144 static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, 1145 IntrinsicInst &II) { 1146 IRBuilder<> Builder(II.getContext()); 1147 Builder.SetInsertPoint(&II); 1148 Type *Int32Ty = Builder.getInt32Ty(); 1149 Value *Pred = II.getOperand(0); 1150 Value *Vec = II.getOperand(1); 1151 Value *DivVec = II.getOperand(2); 1152 1153 Value *SplatValue = getSplatValue(DivVec); 1154 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue); 1155 if (!SplatConstantInt) 1156 return None; 1157 APInt Divisor = SplatConstantInt->getValue(); 1158 1159 if (Divisor.isPowerOf2()) { 1160 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1161 auto ASRD = Builder.CreateIntrinsic( 1162 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1163 return IC.replaceInstUsesWith(II, ASRD); 1164 } 1165 if (Divisor.isNegatedPowerOf2()) { 1166 Divisor.negate(); 1167 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1168 auto ASRD = Builder.CreateIntrinsic( 1169 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1170 auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg, 1171 {ASRD->getType()}, {ASRD, Pred, ASRD}); 1172 return IC.replaceInstUsesWith(II, NEG); 1173 } 1174 1175 return None; 1176 } 1177 1178 Optional<Instruction *> 1179 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, 1180 IntrinsicInst &II) const { 1181 Intrinsic::ID IID = II.getIntrinsicID(); 1182 switch (IID) { 1183 default: 1184 break; 
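// Each combine below either returns the replacement it made or None, in
// which case the target-independent handling applies.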
1185 case Intrinsic::aarch64_sve_convert_from_svbool: 1186 return instCombineConvertFromSVBool(IC, II); 1187 case Intrinsic::aarch64_sve_dup: 1188 return instCombineSVEDup(IC, II); 1189 case Intrinsic::aarch64_sve_dup_x: 1190 return instCombineSVEDupX(IC, II); 1191 case Intrinsic::aarch64_sve_cmpne: 1192 case Intrinsic::aarch64_sve_cmpne_wide: 1193 return instCombineSVECmpNE(IC, II); 1194 case Intrinsic::aarch64_sve_rdffr: 1195 return instCombineRDFFR(IC, II); 1196 case Intrinsic::aarch64_sve_lasta: 1197 case Intrinsic::aarch64_sve_lastb: 1198 return instCombineSVELast(IC, II); 1199 case Intrinsic::aarch64_sve_cntd: 1200 return instCombineSVECntElts(IC, II, 2); 1201 case Intrinsic::aarch64_sve_cntw: 1202 return instCombineSVECntElts(IC, II, 4); 1203 case Intrinsic::aarch64_sve_cnth: 1204 return instCombineSVECntElts(IC, II, 8); 1205 case Intrinsic::aarch64_sve_cntb: 1206 return instCombineSVECntElts(IC, II, 16); 1207 case Intrinsic::aarch64_sve_ptest_any: 1208 case Intrinsic::aarch64_sve_ptest_first: 1209 case Intrinsic::aarch64_sve_ptest_last: 1210 return instCombineSVEPTest(IC, II); 1211 case Intrinsic::aarch64_sve_mul: 1212 case Intrinsic::aarch64_sve_fmul: 1213 return instCombineSVEVectorMul(IC, II); 1214 case Intrinsic::aarch64_sve_fadd: 1215 return instCombineSVEVectorFAdd(IC, II); 1216 case Intrinsic::aarch64_sve_fsub: 1217 return instCombineSVEVectorBinOp(IC, II); 1218 case Intrinsic::aarch64_sve_tbl: 1219 return instCombineSVETBL(IC, II); 1220 case Intrinsic::aarch64_sve_uunpkhi: 1221 case Intrinsic::aarch64_sve_uunpklo: 1222 case Intrinsic::aarch64_sve_sunpkhi: 1223 case Intrinsic::aarch64_sve_sunpklo: 1224 return instCombineSVEUnpack(IC, II); 1225 case Intrinsic::aarch64_sve_tuple_get: 1226 return instCombineSVETupleGet(IC, II); 1227 case Intrinsic::aarch64_sve_zip1: 1228 case Intrinsic::aarch64_sve_zip2: 1229 return instCombineSVEZip(IC, II); 1230 case Intrinsic::aarch64_sve_ld1_gather_index: 1231 return instCombineLD1GatherIndex(IC, II); 1232 case Intrinsic::aarch64_sve_st1_scatter_index: 1233 return instCombineST1ScatterIndex(IC, II); 1234 case Intrinsic::aarch64_sve_ld1: 1235 return instCombineSVELD1(IC, II, DL); 1236 case Intrinsic::aarch64_sve_st1: 1237 return instCombineSVEST1(IC, II, DL); 1238 case Intrinsic::aarch64_sve_sdiv: 1239 return instCombineSVESDIV(IC, II); 1240 case Intrinsic::aarch64_sve_sel: 1241 return instCombineSVESel(IC, II); 1242 } 1243 1244 return None; 1245 } 1246 1247 Optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( 1248 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, 1249 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, 1250 std::function<void(Instruction *, unsigned, APInt, APInt &)> 1251 SimplifyAndSetOp) const { 1252 switch (II.getIntrinsicID()) { 1253 default: 1254 break; 1255 case Intrinsic::aarch64_neon_fcvtxn: 1256 case Intrinsic::aarch64_neon_rshrn: 1257 case Intrinsic::aarch64_neon_sqrshrn: 1258 case Intrinsic::aarch64_neon_sqrshrun: 1259 case Intrinsic::aarch64_neon_sqshrn: 1260 case Intrinsic::aarch64_neon_sqshrun: 1261 case Intrinsic::aarch64_neon_sqxtn: 1262 case Intrinsic::aarch64_neon_sqxtun: 1263 case Intrinsic::aarch64_neon_uqrshrn: 1264 case Intrinsic::aarch64_neon_uqshrn: 1265 case Intrinsic::aarch64_neon_uqxtn: 1266 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); 1267 break; 1268 } 1269 1270 return None; 1271 } 1272 1273 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, 1274 ArrayRef<const Value *> Args) { 1275 1276 // A helper that returns a vector type from 
the given type. The number of 1277 // elements in type Ty determine the vector width. 1278 auto toVectorTy = [&](Type *ArgTy) { 1279 return VectorType::get(ArgTy->getScalarType(), 1280 cast<VectorType>(DstTy)->getElementCount()); 1281 }; 1282 1283 // Exit early if DstTy is not a vector type whose elements are at least 1284 // 16-bits wide. 1285 if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16) 1286 return false; 1287 1288 // Determine if the operation has a widening variant. We consider both the 1289 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the 1290 // instructions. 1291 // 1292 // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we 1293 // verify that their extending operands are eliminated during code 1294 // generation. 1295 switch (Opcode) { 1296 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). 1297 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). 1298 break; 1299 default: 1300 return false; 1301 } 1302 1303 // To be a widening instruction (either the "wide" or "long" versions), the 1304 // second operand must be a sign- or zero extend having a single user. We 1305 // only consider extends having a single user because they may otherwise not 1306 // be eliminated. 1307 if (Args.size() != 2 || 1308 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) || 1309 !Args[1]->hasOneUse()) 1310 return false; 1311 auto *Extend = cast<CastInst>(Args[1]); 1312 1313 // Legalize the destination type and ensure it can be used in a widening 1314 // operation. 1315 auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); 1316 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); 1317 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) 1318 return false; 1319 1320 // Legalize the source type and ensure it can be used in a widening 1321 // operation. 1322 auto *SrcTy = toVectorTy(Extend->getSrcTy()); 1323 auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); 1324 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); 1325 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) 1326 return false; 1327 1328 // Get the total number of vector elements in the legalized types. 1329 InstructionCost NumDstEls = 1330 DstTyL.first * DstTyL.second.getVectorMinNumElements(); 1331 InstructionCost NumSrcEls = 1332 SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); 1333 1334 // Return true if the legalized types have the same number of vector elements 1335 // and the destination element type size is twice that of the source type. 1336 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; 1337 } 1338 1339 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 1340 Type *Src, 1341 TTI::CastContextHint CCH, 1342 TTI::TargetCostKind CostKind, 1343 const Instruction *I) { 1344 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1345 assert(ISD && "Invalid opcode"); 1346 1347 // If the cast is observable, and it is used by a widening instruction (e.g., 1348 // uaddl, saddw, etc.), it may be free. 1349 if (I && I->hasOneUse()) { 1350 auto *SingleUser = cast<Instruction>(*I->user_begin()); 1351 SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); 1352 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { 1353 // If the cast is the second operand, it is free. We will generate either 1354 // a "wide" or "long" version of the widening instruction. 
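// For example, in %e = zext <8 x i8> %x to <8 x i16>; add <8 x i16> %a, %e
// the zext folds into a uaddw, so it is considered free.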
1355 if (I == SingleUser->getOperand(1)) 1356 return 0; 1357 // If the cast is not the second operand, it will be free if it looks the 1358 // same as the second operand. In this case, we will generate a "long" 1359 // version of the widening instruction. 1360 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) 1361 if (I->getOpcode() == unsigned(Cast->getOpcode()) && 1362 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) 1363 return 0; 1364 } 1365 } 1366 1367 // TODO: Allow non-throughput costs that aren't binary. 1368 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { 1369 if (CostKind != TTI::TCK_RecipThroughput) 1370 return Cost == 0 ? 0 : 1; 1371 return Cost; 1372 }; 1373 1374 EVT SrcTy = TLI->getValueType(DL, Src); 1375 EVT DstTy = TLI->getValueType(DL, Dst); 1376 1377 if (!SrcTy.isSimple() || !DstTy.isSimple()) 1378 return AdjustCost( 1379 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1380 1381 static const TypeConversionCostTblEntry 1382 ConversionTbl[] = { 1383 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, 1384 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, 1385 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, 1386 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, 1387 1388 // Truncations on nxvmiN 1389 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, 1390 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, 1391 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, 1392 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, 1393 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, 1394 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, 1395 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, 1396 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, 1397 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, 1398 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, 1399 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, 1400 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, 1401 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, 1402 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, 1403 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, 1404 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, 1405 1406 // The number of shll instructions for the extension. 
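// For example, v4i16 -> v4i64 needs one shll to reach v4i32 and then shll
// plus shll2 to form the two v2i64 halves, hence the cost of 3 below.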
1407 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1408 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1409 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1410 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1411 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1412 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1413 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1414 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1415 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1416 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1417 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1418 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1419 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1420 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1421 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1422 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1423 1424 // LowerVectorINT_TO_FP: 1425 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1426 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1427 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1428 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1429 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1430 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1431 1432 // Complex: to v2f32 1433 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1434 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1435 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1436 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1437 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1438 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1439 1440 // Complex: to v4f32 1441 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, 1442 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1443 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 1444 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1445 1446 // Complex: to v8f32 1447 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1448 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 1449 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1450 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 1451 1452 // Complex: to v16f32 1453 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1454 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1455 1456 // Complex: to v2f64 1457 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1458 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1459 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1460 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1461 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1462 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1463 1464 1465 // LowerVectorFP_TO_INT 1466 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, 1467 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 1468 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 1469 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, 1470 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 1471 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 1472 1473 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). 
1474 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, 1475 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, 1476 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, 1477 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, 1478 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, 1479 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, 1480 1481 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 1482 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, 1483 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, 1484 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, 1485 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, 1486 1487 // Complex, from nxv2f32. 1488 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1489 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1490 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1491 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1492 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1493 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1494 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1495 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1496 1497 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. 1498 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, 1499 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, 1500 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, 1501 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, 1502 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, 1503 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, 1504 1505 // Complex, from nxv2f64. 1506 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1507 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1508 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1509 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1510 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1511 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1512 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1513 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1514 1515 // Complex, from nxv4f32. 1516 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1517 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1518 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1519 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1520 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1521 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1522 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1523 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1524 1525 // Complex, from nxv8f64. Illegal -> illegal conversions not required. 1526 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1527 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1528 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1529 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1530 1531 // Complex, from nxv4f64. Illegal -> illegal conversions not required. 1532 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1533 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1534 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1535 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1536 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1537 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1538 1539 // Complex, from nxv8f32. Illegal -> illegal conversions not required. 1540 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 1541 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 1542 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 1543 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 1544 1545 // Complex, from nxv8f16. 
1546 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 1547 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 1548 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 1549 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 1550 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 1551 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 1552 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 1553 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 1554 1555 // Complex, from nxv4f16. 1556 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 1557 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 1558 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 1559 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 1560 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 1561 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 1562 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 1563 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 1564 1565 // Complex, from nxv2f16. 1566 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 1567 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 1568 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 1569 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 1570 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 1571 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 1572 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 1573 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 1574 1575 // Truncate from nxvmf32 to nxvmf16. 1576 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, 1577 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, 1578 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, 1579 1580 // Truncate from nxvmf64 to nxvmf16. 1581 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 }, 1582 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 }, 1583 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 }, 1584 1585 // Truncate from nxvmf64 to nxvmf32. 1586 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 }, 1587 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 }, 1588 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 }, 1589 1590 // Extend from nxvmf16 to nxvmf32. 1591 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, 1592 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, 1593 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, 1594 1595 // Extend from nxvmf16 to nxvmf64. 1596 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, 1597 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, 1598 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, 1599 1600 // Extend from nxvmf32 to nxvmf64. 
1601 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, 1602 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, 1603 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, 1604 1605 // Bitcasts from float to integer 1606 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 }, 1607 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 }, 1608 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 }, 1609 1610 // Bitcasts from integer to float 1611 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 }, 1612 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 }, 1613 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 }, 1614 }; 1615 1616 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, 1617 DstTy.getSimpleVT(), 1618 SrcTy.getSimpleVT())) 1619 return AdjustCost(Entry->Cost); 1620 1621 static const TypeConversionCostTblEntry FP16Tbl[] = { 1622 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs 1623 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, 1624 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs 1625 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, 1626 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs 1627 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, 1628 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn 1629 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, 1630 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs 1631 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, 1632 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs 1633 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, 1634 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn 1635 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, 1636 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs 1637 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, 1638 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs 1639 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, 1640 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf 1641 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf 1642 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf 1643 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf 1644 }; 1645 1646 if (ST->hasFullFP16()) 1647 if (const auto *Entry = ConvertCostTableLookup( 1648 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 1649 return AdjustCost(Entry->Cost); 1650 1651 return AdjustCost( 1652 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1653 } 1654 1655 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, 1656 Type *Dst, 1657 VectorType *VecTy, 1658 unsigned Index) { 1659 1660 // Make sure we were given a valid extend opcode. 1661 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && 1662 "Invalid opcode"); 1663 1664 // We are extending an element we extract from a vector, so the source type 1665 // of the extend is the element type of the vector. 1666 auto *Src = VecTy->getElementType(); 1667 1668 // Sign- and zero-extends are for integer types only. 1669 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); 1670 1671 // Get the cost for the extract. We compute the cost (if any) for the extend 1672 // below. 1673 InstructionCost Cost = 1674 getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); 1675 1676 // Legalize the types. 
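// The legalized vector type and the scalar source/destination types computed
// below determine whether the extend can be folded into the lane move
// (smov/umov) produced for the extract; if it cannot, we fall back to the
// generic cast cost.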
1677 auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); 1678 auto DstVT = TLI->getValueType(DL, Dst); 1679 auto SrcVT = TLI->getValueType(DL, Src); 1680 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1681 1682 // If the resulting type is still a vector and the destination type is legal, 1683 // we may get the extension for free. If not, get the default cost for the 1684 // extend. 1685 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) 1686 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1687 CostKind); 1688 1689 // The destination type should be larger than the element type. If not, get 1690 // the default cost for the extend. 1691 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) 1692 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1693 CostKind); 1694 1695 switch (Opcode) { 1696 default: 1697 llvm_unreachable("Opcode should be either SExt or ZExt"); 1698 1699 // For sign-extends, we only need a smov, which performs the extension 1700 // automatically. 1701 case Instruction::SExt: 1702 return Cost; 1703 1704 // For zero-extends, the extend is performed automatically by a umov unless 1705 // the destination type is i64 and the element type is i8 or i16. 1706 case Instruction::ZExt: 1707 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) 1708 return Cost; 1709 } 1710 1711 // If we are unable to perform the extend for free, get the default cost. 1712 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1713 CostKind); 1714 } 1715 1716 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, 1717 TTI::TargetCostKind CostKind, 1718 const Instruction *I) { 1719 if (CostKind != TTI::TCK_RecipThroughput) 1720 return Opcode == Instruction::PHI ? 0 : 1; 1721 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); 1722 // Branches are assumed to be predicted. 1723 return 0; 1724 } 1725 1726 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 1727 unsigned Index) { 1728 assert(Val->isVectorTy() && "This must be a vector type"); 1729 1730 if (Index != -1U) { 1731 // Legalize the type. 1732 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); 1733 1734 // This type is legalized to a scalar type. 1735 if (!LT.second.isVector()) 1736 return 0; 1737 1738 // The type may be split. For fixed-width vectors we can normalize the 1739 // index to the new type. 1740 if (LT.second.isFixedLengthVector()) { 1741 unsigned Width = LT.second.getVectorNumElements(); 1742 Index = Index % Width; 1743 } 1744 1745 // The element at index zero is already inside the vector. 1746 if (Index == 0) 1747 return 0; 1748 } 1749 1750 // All other insert/extracts cost this much. 1751 return ST->getVectorInsertExtractBaseCost(); 1752 } 1753 1754 InstructionCost AArch64TTIImpl::getArithmeticInstrCost( 1755 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 1756 TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, 1757 TTI::OperandValueProperties Opd1PropInfo, 1758 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, 1759 const Instruction *CxtI) { 1760 // TODO: Handle more cost kinds. 1761 if (CostKind != TTI::TCK_RecipThroughput) 1762 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 1763 Opd2Info, Opd1PropInfo, 1764 Opd2PropInfo, Args, CxtI); 1765 1766 // Legalize the type. 
1767 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1768
1769 // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
1770 // add in the widening overhead specified by the sub-target. Since the
1771 // extends feeding widening instructions are performed automatically, they
1772 // aren't present in the generated code and have a zero cost. By adding a
1773 // widening overhead here, we attach the total cost of the combined operation
1774 // to the widening instruction.
1775 InstructionCost Cost = 0;
1776 if (isWideningInstruction(Ty, Opcode, Args))
1777 Cost += ST->getWideningBaseCost();
1778
1779 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1780
1781 switch (ISD) {
1782 default:
1783 return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1784 Opd2Info,
1785 Opd1PropInfo, Opd2PropInfo);
1786 case ISD::SDIV:
1787 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
1788 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
1789 // On AArch64, scalar signed division by a constant power-of-two is
1790 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
1791 // The OperandValue properties may not be the same as those of the
1792 // previous operation; conservatively assume OP_None.
1793 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
1794 Opd1Info, Opd2Info,
1795 TargetTransformInfo::OP_None,
1796 TargetTransformInfo::OP_None);
1797 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
1798 Opd1Info, Opd2Info,
1799 TargetTransformInfo::OP_None,
1800 TargetTransformInfo::OP_None);
1801 Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
1802 Opd1Info, Opd2Info,
1803 TargetTransformInfo::OP_None,
1804 TargetTransformInfo::OP_None);
1805 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
1806 Opd1Info, Opd2Info,
1807 TargetTransformInfo::OP_None,
1808 TargetTransformInfo::OP_None);
1809 return Cost;
1810 }
1811 LLVM_FALLTHROUGH;
1812 case ISD::UDIV:
1813 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
1814 auto VT = TLI->getValueType(DL, Ty);
1815 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
1816 // Vector signed division by a constant is expanded to the
1817 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
1818 // to MULHS + SUB + SRL + ADD + SRL.
1819 InstructionCost MulCost = getArithmeticInstrCost(
1820 Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
1821 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1822 InstructionCost AddCost = getArithmeticInstrCost(
1823 Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
1824 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1825 InstructionCost ShrCost = getArithmeticInstrCost(
1826 Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
1827 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1828 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
1829 }
1830 }
1831
1832 Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1833 Opd2Info,
1834 Opd1PropInfo, Opd2PropInfo);
1835 if (Ty->isVectorTy()) {
1836 // On AArch64, vector divisions are not supported natively and are
1837 // expanded into scalar divisions of each pair of elements.
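// Besides the scalarized divides themselves, account for moving each element
// out of and back into a vector register; the doubling below (Cost += Cost)
// conservatively assumes both operands need to be scalarized.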
1838 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind, 1839 Opd1Info, Opd2Info, Opd1PropInfo, 1840 Opd2PropInfo); 1841 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind, 1842 Opd1Info, Opd2Info, Opd1PropInfo, 1843 Opd2PropInfo); 1844 // TODO: if one of the arguments is scalar, then it's not necessary to 1845 // double the cost of handling the vector elements. 1846 Cost += Cost; 1847 } 1848 return Cost; 1849 1850 case ISD::MUL: 1851 if (LT.second != MVT::v2i64) 1852 return (Cost + 1) * LT.first; 1853 // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive 1854 // as elements are extracted from the vectors and the muls scalarized. 1855 // As getScalarizationOverhead is a bit too pessimistic, we estimate the 1856 // cost for a i64 vector directly here, which is: 1857 // - four i64 extracts, 1858 // - two i64 inserts, and 1859 // - two muls. 1860 // So, for a v2i64 with LT.First = 1 the cost is 8, and for a v4i64 with 1861 // LT.first = 2 the cost is 16. 1862 return LT.first * 8; 1863 case ISD::ADD: 1864 case ISD::XOR: 1865 case ISD::OR: 1866 case ISD::AND: 1867 case ISD::SRL: 1868 case ISD::SRA: 1869 case ISD::SHL: 1870 // These nodes are marked as 'custom' for combining purposes only. 1871 // We know that they are legal. See LowerAdd in ISelLowering. 1872 return (Cost + 1) * LT.first; 1873 1874 case ISD::FADD: 1875 case ISD::FSUB: 1876 case ISD::FMUL: 1877 case ISD::FDIV: 1878 case ISD::FNEG: 1879 // These nodes are marked as 'custom' just to lower them to SVE. 1880 // We know said lowering will incur no additional cost. 1881 if (!Ty->getScalarType()->isFP128Ty()) 1882 return (Cost + 2) * LT.first; 1883 1884 return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 1885 Opd2Info, 1886 Opd1PropInfo, Opd2PropInfo); 1887 } 1888 } 1889 1890 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, 1891 ScalarEvolution *SE, 1892 const SCEV *Ptr) { 1893 // Address computations in vectorized code with non-consecutive addresses will 1894 // likely result in more instructions compared to scalar code where the 1895 // computation can more often be merged into the index mode. The resulting 1896 // extra micro-ops can significantly decrease throughput. 1897 unsigned NumVectorInstToHideOverhead = 10; 1898 int MaxMergeDistance = 64; 1899 1900 if (Ty->isVectorTy() && SE && 1901 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) 1902 return NumVectorInstToHideOverhead; 1903 1904 // In many cases the address computation is not merged into the instruction 1905 // addressing mode. 1906 return 1; 1907 } 1908 1909 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 1910 Type *CondTy, 1911 CmpInst::Predicate VecPred, 1912 TTI::TargetCostKind CostKind, 1913 const Instruction *I) { 1914 // TODO: Handle other cost kinds. 1915 if (CostKind != TTI::TCK_RecipThroughput) 1916 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 1917 I); 1918 1919 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1920 // We don't lower some vector selects well that are wider than the register 1921 // width. 1922 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { 1923 // We would need this many instructions to hide the scalarization happening. 1924 const int AmortizationCost = 20; 1925 1926 // If VecPred is not set, check if we can get a predicate from the context 1927 // instruction, if its type matches the requested ValTy. 
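// For example, given a chain such as 'select (fcmp ogt %a, %b), %a, %b', the
// OGT predicate is recovered from the compare feeding the select.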
1928 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
1929 CmpInst::Predicate CurrentPred;
1930 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
1931 m_Value())))
1932 VecPred = CurrentPred;
1933 }
1934 // Check if we have a compare/select chain that can be lowered using
1935 // a (F)CMxx & BFI pair.
1936 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
1937 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
1938 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
1939 VecPred == CmpInst::FCMP_UNE) {
1940 static const auto ValidMinMaxTys = {
1941 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1942 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
1943 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
1944
1945 auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
1946 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
1947 (ST->hasFullFP16() &&
1948 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
1949 return LT.first;
1950 }
1951
1952 static const TypeConversionCostTblEntry
1953 VectorSelectTbl[] = {
1954 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
1955 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
1956 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
1957 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
1958 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
1959 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
1960 };
1961
1962 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1963 EVT SelValTy = TLI->getValueType(DL, ValTy);
1964 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1965 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
1966 SelCondTy.getSimpleVT(),
1967 SelValTy.getSimpleVT()))
1968 return Entry->Cost;
1969 }
1970 }
1971 // The base case handles scalable vectors fine for now, since it treats the
1972 // cost as 1 * legalization cost.
1973 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1974 }
1975
1976 AArch64TTIImpl::TTI::MemCmpExpansionOptions
1977 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
1978 TTI::MemCmpExpansionOptions Options;
1979 if (ST->requiresStrictAlign()) {
1980 // TODO: Add cost modeling for strict align. Misaligned loads expand to
1981 // a bunch of instructions when strict align is enabled.
1982 return Options;
1983 }
1984 Options.AllowOverlappingLoads = true;
1985 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
1986 Options.NumLoadsPerBlock = Options.MaxNumLoads;
1987 // TODO: Though vector loads usually perform well on AArch64, in some targets
1988 // they may wake up the FP unit, which raises the power consumption. Perhaps
1989 // they could be used with no holds barred (-O3).
1990 Options.LoadSizes = {8, 4, 2, 1};
1991 return Options;
1992 }
1993
1994 InstructionCost
1995 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
1996 Align Alignment, unsigned AddressSpace,
1997 TTI::TargetCostKind CostKind) {
1998 if (useNeonVector(Src))
1999 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2000 CostKind);
2001 auto LT = TLI->getTypeLegalizationCost(DL, Src);
2002 if (!LT.first.isValid())
2003 return InstructionCost::getInvalid();
2004
2005 // The code-generator is currently not able to handle scalable vectors
2006 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2007 // it.
This change will be removed when code-generation for these types is 2008 // sufficiently reliable. 2009 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1)) 2010 return InstructionCost::getInvalid(); 2011 2012 return LT.first * 2; 2013 } 2014 2015 static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { 2016 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead; 2017 } 2018 2019 InstructionCost AArch64TTIImpl::getGatherScatterOpCost( 2020 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 2021 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 2022 if (useNeonVector(DataTy)) 2023 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 2024 Alignment, CostKind, I); 2025 auto *VT = cast<VectorType>(DataTy); 2026 auto LT = TLI->getTypeLegalizationCost(DL, DataTy); 2027 if (!LT.first.isValid()) 2028 return InstructionCost::getInvalid(); 2029 2030 // The code-generator is currently not able to handle scalable vectors 2031 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2032 // it. This change will be removed when code-generation for these types is 2033 // sufficiently reliable. 2034 if (cast<VectorType>(DataTy)->getElementCount() == 2035 ElementCount::getScalable(1)) 2036 return InstructionCost::getInvalid(); 2037 2038 ElementCount LegalVF = LT.second.getVectorElementCount(); 2039 InstructionCost MemOpCost = 2040 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); 2041 // Add on an overhead cost for using gathers/scatters. 2042 // TODO: At the moment this is applied unilaterally for all CPUs, but at some 2043 // point we may want a per-CPU overhead. 2044 MemOpCost *= getSVEGatherScatterOverhead(Opcode); 2045 return LT.first * MemOpCost * getMaxNumElements(LegalVF); 2046 } 2047 2048 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { 2049 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); 2050 } 2051 2052 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, 2053 MaybeAlign Alignment, 2054 unsigned AddressSpace, 2055 TTI::TargetCostKind CostKind, 2056 const Instruction *I) { 2057 EVT VT = TLI->getValueType(DL, Ty, true); 2058 // Type legalization can't handle structs 2059 if (VT == MVT::Other) 2060 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, 2061 CostKind); 2062 2063 auto LT = TLI->getTypeLegalizationCost(DL, Ty); 2064 if (!LT.first.isValid()) 2065 return InstructionCost::getInvalid(); 2066 2067 // The code-generator is currently not able to handle scalable vectors 2068 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2069 // it. This change will be removed when code-generation for these types is 2070 // sufficiently reliable. 2071 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) 2072 if (VTy->getElementCount() == ElementCount::getScalable(1)) 2073 return InstructionCost::getInvalid(); 2074 2075 // TODO: consider latency as well for TCK_SizeAndLatency. 2076 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) 2077 return LT.first; 2078 2079 if (CostKind != TTI::TCK_RecipThroughput) 2080 return 1; 2081 2082 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && 2083 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) { 2084 // Unaligned stores are extremely inefficient. 
We don't split all
2085 // unaligned 128-bit stores because of the negative impact this has shown in
2086 // practice on inlined block copy code.
2087 // We make such stores expensive so that we will only vectorize if there
2088 // are 6 other instructions getting vectorized.
2089 const int AmortizationCost = 6;
2090
2091 return LT.first * 2 * AmortizationCost;
2092 }
2093
2094 // Check truncating stores and extending loads.
2095 if (useNeonVector(Ty) &&
2096 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
2097 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
2098 if (VT == MVT::v4i8)
2099 return 2;
2100 // Otherwise we need to scalarize.
2101 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
2102 }
2103
2104 return LT.first;
2105 }
2106
2107 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
2108 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
2109 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
2110 bool UseMaskForCond, bool UseMaskForGaps) {
2111 assert(Factor >= 2 && "Invalid interleave factor");
2112 auto *VecVTy = cast<FixedVectorType>(VecTy);
2113
2114 if (!UseMaskForCond && !UseMaskForGaps &&
2115 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
2116 unsigned NumElts = VecVTy->getNumElements();
2117 auto *SubVecTy =
2118 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
2119
2120 // ldN/stN only support legal vector types of size 64 or 128 in bits.
2121 // Accesses having vector types that are a multiple of 128 bits can be
2122 // matched to more than one ldN/stN instruction.
2123 bool UseScalable;
2124 if (NumElts % Factor == 0 &&
2125 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
2126 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
2127 }
2128
2129 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2130 Alignment, AddressSpace, CostKind,
2131 UseMaskForCond, UseMaskForGaps);
2132 }
2133
2134 InstructionCost
2135 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
2136 InstructionCost Cost = 0;
2137 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2138 for (auto *I : Tys) {
2139 if (!I->isVectorTy())
2140 continue;
2141 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
2142 128)
2143 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
2144 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
2145 }
2146 return Cost;
2147 }
2148
2149 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
2150 return ST->getMaxInterleaveFactor();
2151 }
2152
2153 // For Falkor, we want to avoid having too many strided loads in a loop since
2154 // that can exhaust the HW prefetcher resources. We adjust the unroller
2155 // MaxCount preference below to attempt to ensure unrolling doesn't create too
2156 // many strided loads.
2157 static void
2158 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2159 TargetTransformInfo::UnrollingPreferences &UP) {
2160 enum { MaxStridedLoads = 7 };
2161 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
2162 int StridedLoads = 0;
2163 // FIXME? We could make this more precise by looking at the CFG and
2164 // e.g. not counting loads in each side of an if-then-else diamond.
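// Walk all loads in the loop and count those whose address is an affine
// AddRec in this loop, i.e. loads that stride with the induction variable and
// therefore consume HW prefetcher resources.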
2165 for (const auto BB : L->blocks()) { 2166 for (auto &I : *BB) { 2167 LoadInst *LMemI = dyn_cast<LoadInst>(&I); 2168 if (!LMemI) 2169 continue; 2170 2171 Value *PtrValue = LMemI->getPointerOperand(); 2172 if (L->isLoopInvariant(PtrValue)) 2173 continue; 2174 2175 const SCEV *LSCEV = SE.getSCEV(PtrValue); 2176 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); 2177 if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) 2178 continue; 2179 2180 // FIXME? We could take pairing of unrolled load copies into account 2181 // by looking at the AddRec, but we would probably have to limit this 2182 // to loops with no stores or other memory optimization barriers. 2183 ++StridedLoads; 2184 // We've seen enough strided loads that seeing more won't make a 2185 // difference. 2186 if (StridedLoads > MaxStridedLoads / 2) 2187 return StridedLoads; 2188 } 2189 } 2190 return StridedLoads; 2191 }; 2192 2193 int StridedLoads = countStridedLoads(L, SE); 2194 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads 2195 << " strided loads\n"); 2196 // Pick the largest power of 2 unroll count that won't result in too many 2197 // strided loads. 2198 if (StridedLoads) { 2199 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); 2200 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " 2201 << UP.MaxCount << '\n'); 2202 } 2203 } 2204 2205 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2206 TTI::UnrollingPreferences &UP, 2207 OptimizationRemarkEmitter *ORE) { 2208 // Enable partial unrolling and runtime unrolling. 2209 BaseT::getUnrollingPreferences(L, SE, UP, ORE); 2210 2211 UP.UpperBound = true; 2212 2213 // For inner loop, it is more likely to be a hot one, and the runtime check 2214 // can be promoted out from LICM pass, so the overhead is less, let's try 2215 // a larger threshold to unroll more loops. 2216 if (L->getLoopDepth() > 1) 2217 UP.PartialThreshold *= 2; 2218 2219 // Disable partial & runtime unrolling on -Os. 2220 UP.PartialOptSizeThreshold = 0; 2221 2222 if (ST->getProcFamily() == AArch64Subtarget::Falkor && 2223 EnableFalkorHWPFUnrollFix) 2224 getFalkorUnrollingPreferences(L, SE, UP); 2225 2226 // Scan the loop: don't unroll loops with calls as this could prevent 2227 // inlining. Don't unroll vector loops either, as they don't benefit much from 2228 // unrolling. 2229 for (auto *BB : L->getBlocks()) { 2230 for (auto &I : *BB) { 2231 // Don't unroll vectorised loop. 
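// Returning here also skips the runtime/partial unrolling defaults that are
// enabled below for in-order cores.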
2232 if (I.getType()->isVectorTy()) 2233 return; 2234 2235 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 2236 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 2237 if (!isLoweredToCall(F)) 2238 continue; 2239 } 2240 return; 2241 } 2242 } 2243 } 2244 2245 // Enable runtime unrolling for in-order models 2246 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by 2247 // checking for that case, we can ensure that the default behaviour is 2248 // unchanged 2249 if (ST->getProcFamily() != AArch64Subtarget::Others && 2250 !ST->getSchedModel().isOutOfOrder()) { 2251 UP.Runtime = true; 2252 UP.Partial = true; 2253 UP.UnrollRemainder = true; 2254 UP.DefaultUnrollRuntimeCount = 4; 2255 2256 UP.UnrollAndJam = true; 2257 UP.UnrollAndJamInnerLoopThreshold = 60; 2258 } 2259 } 2260 2261 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 2262 TTI::PeelingPreferences &PP) { 2263 BaseT::getPeelingPreferences(L, SE, PP); 2264 } 2265 2266 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, 2267 Type *ExpectedType) { 2268 switch (Inst->getIntrinsicID()) { 2269 default: 2270 return nullptr; 2271 case Intrinsic::aarch64_neon_st2: 2272 case Intrinsic::aarch64_neon_st3: 2273 case Intrinsic::aarch64_neon_st4: { 2274 // Create a struct type 2275 StructType *ST = dyn_cast<StructType>(ExpectedType); 2276 if (!ST) 2277 return nullptr; 2278 unsigned NumElts = Inst->arg_size() - 1; 2279 if (ST->getNumElements() != NumElts) 2280 return nullptr; 2281 for (unsigned i = 0, e = NumElts; i != e; ++i) { 2282 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) 2283 return nullptr; 2284 } 2285 Value *Res = UndefValue::get(ExpectedType); 2286 IRBuilder<> Builder(Inst); 2287 for (unsigned i = 0, e = NumElts; i != e; ++i) { 2288 Value *L = Inst->getArgOperand(i); 2289 Res = Builder.CreateInsertValue(Res, L, i); 2290 } 2291 return Res; 2292 } 2293 case Intrinsic::aarch64_neon_ld2: 2294 case Intrinsic::aarch64_neon_ld3: 2295 case Intrinsic::aarch64_neon_ld4: 2296 if (Inst->getType() == ExpectedType) 2297 return Inst; 2298 return nullptr; 2299 } 2300 } 2301 2302 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, 2303 MemIntrinsicInfo &Info) { 2304 switch (Inst->getIntrinsicID()) { 2305 default: 2306 break; 2307 case Intrinsic::aarch64_neon_ld2: 2308 case Intrinsic::aarch64_neon_ld3: 2309 case Intrinsic::aarch64_neon_ld4: 2310 Info.ReadMem = true; 2311 Info.WriteMem = false; 2312 Info.PtrVal = Inst->getArgOperand(0); 2313 break; 2314 case Intrinsic::aarch64_neon_st2: 2315 case Intrinsic::aarch64_neon_st3: 2316 case Intrinsic::aarch64_neon_st4: 2317 Info.ReadMem = false; 2318 Info.WriteMem = true; 2319 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1); 2320 break; 2321 } 2322 2323 switch (Inst->getIntrinsicID()) { 2324 default: 2325 return false; 2326 case Intrinsic::aarch64_neon_ld2: 2327 case Intrinsic::aarch64_neon_st2: 2328 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; 2329 break; 2330 case Intrinsic::aarch64_neon_ld3: 2331 case Intrinsic::aarch64_neon_st3: 2332 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; 2333 break; 2334 case Intrinsic::aarch64_neon_ld4: 2335 case Intrinsic::aarch64_neon_st4: 2336 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; 2337 break; 2338 } 2339 return true; 2340 } 2341 2342 /// See if \p I should be considered for address type promotion. We check if \p 2343 /// I is a sext with right type and used in memory accesses. 
If it used in a 2344 /// "complex" getelementptr, we allow it to be promoted without finding other 2345 /// sext instructions that sign extended the same initial value. A getelementptr 2346 /// is considered as "complex" if it has more than 2 operands. 2347 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( 2348 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { 2349 bool Considerable = false; 2350 AllowPromotionWithoutCommonHeader = false; 2351 if (!isa<SExtInst>(&I)) 2352 return false; 2353 Type *ConsideredSExtType = 2354 Type::getInt64Ty(I.getParent()->getParent()->getContext()); 2355 if (I.getType() != ConsideredSExtType) 2356 return false; 2357 // See if the sext is the one with the right type and used in at least one 2358 // GetElementPtrInst. 2359 for (const User *U : I.users()) { 2360 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { 2361 Considerable = true; 2362 // A getelementptr is considered as "complex" if it has more than 2 2363 // operands. We will promote a SExt used in such complex GEP as we 2364 // expect some computation to be merged if they are done on 64 bits. 2365 if (GEPInst->getNumOperands() > 2) { 2366 AllowPromotionWithoutCommonHeader = true; 2367 break; 2368 } 2369 } 2370 } 2371 return Considerable; 2372 } 2373 2374 bool AArch64TTIImpl::isLegalToVectorizeReduction( 2375 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { 2376 if (!VF.isScalable()) 2377 return true; 2378 2379 Type *Ty = RdxDesc.getRecurrenceType(); 2380 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) 2381 return false; 2382 2383 switch (RdxDesc.getRecurrenceKind()) { 2384 case RecurKind::Add: 2385 case RecurKind::FAdd: 2386 case RecurKind::And: 2387 case RecurKind::Or: 2388 case RecurKind::Xor: 2389 case RecurKind::SMin: 2390 case RecurKind::SMax: 2391 case RecurKind::UMin: 2392 case RecurKind::UMax: 2393 case RecurKind::FMin: 2394 case RecurKind::FMax: 2395 case RecurKind::SelectICmp: 2396 case RecurKind::SelectFCmp: 2397 case RecurKind::FMulAdd: 2398 return true; 2399 default: 2400 return false; 2401 } 2402 } 2403 2404 InstructionCost 2405 AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, 2406 bool IsUnsigned, 2407 TTI::TargetCostKind CostKind) { 2408 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 2409 2410 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) 2411 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); 2412 2413 assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) && 2414 "Both vector needs to be equally scalable"); 2415 2416 InstructionCost LegalizationCost = 0; 2417 if (LT.first > 1) { 2418 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); 2419 unsigned MinMaxOpcode = 2420 Ty->isFPOrFPVectorTy() 2421 ? Intrinsic::maxnum 2422 : (IsUnsigned ? 
Intrinsic::umin : Intrinsic::smin); 2423 IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy}); 2424 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1); 2425 } 2426 2427 return LegalizationCost + /*Cost of horizontal reduction*/ 2; 2428 } 2429 2430 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( 2431 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { 2432 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 2433 InstructionCost LegalizationCost = 0; 2434 if (LT.first > 1) { 2435 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); 2436 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); 2437 LegalizationCost *= LT.first - 1; 2438 } 2439 2440 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2441 assert(ISD && "Invalid opcode"); 2442 // Add the final reduction cost for the legal horizontal reduction 2443 switch (ISD) { 2444 case ISD::ADD: 2445 case ISD::AND: 2446 case ISD::OR: 2447 case ISD::XOR: 2448 case ISD::FADD: 2449 return LegalizationCost + 2; 2450 default: 2451 return InstructionCost::getInvalid(); 2452 } 2453 } 2454 2455 InstructionCost 2456 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, 2457 Optional<FastMathFlags> FMF, 2458 TTI::TargetCostKind CostKind) { 2459 if (TTI::requiresOrderedReduction(FMF)) { 2460 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) { 2461 InstructionCost BaseCost = 2462 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 2463 // Add on extra cost to reflect the extra overhead on some CPUs. We still 2464 // end up vectorizing for more computationally intensive loops. 2465 return BaseCost + FixedVTy->getNumElements(); 2466 } 2467 2468 if (Opcode != Instruction::FAdd) 2469 return InstructionCost::getInvalid(); 2470 2471 auto *VTy = cast<ScalableVectorType>(ValTy); 2472 InstructionCost Cost = 2473 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); 2474 Cost *= getMaxNumElements(VTy->getElementCount()); 2475 return Cost; 2476 } 2477 2478 if (isa<ScalableVectorType>(ValTy)) 2479 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); 2480 2481 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 2482 MVT MTy = LT.second; 2483 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2484 assert(ISD && "Invalid opcode"); 2485 2486 // Horizontal adds can use the 'addv' instruction. We model the cost of these 2487 // instructions as twice a normal vector add, plus 1 for each legalization 2488 // step (LT.first). This is the only arithmetic vector reduction operation for 2489 // which we have an instruction. 
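// For example, with the table below a v8i16 add reduction costs 2 (a single
// addv), while a v16i16 reduction is first split in two (LT.first == 2) and
// costs (LT.first - 1) + 2 = 3.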
2490 // OR, XOR and AND costs should match the codegen from: 2491 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll 2492 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll 2493 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll 2494 static const CostTblEntry CostTblNoPairwise[]{ 2495 {ISD::ADD, MVT::v8i8, 2}, 2496 {ISD::ADD, MVT::v16i8, 2}, 2497 {ISD::ADD, MVT::v4i16, 2}, 2498 {ISD::ADD, MVT::v8i16, 2}, 2499 {ISD::ADD, MVT::v4i32, 2}, 2500 {ISD::OR, MVT::v8i8, 15}, 2501 {ISD::OR, MVT::v16i8, 17}, 2502 {ISD::OR, MVT::v4i16, 7}, 2503 {ISD::OR, MVT::v8i16, 9}, 2504 {ISD::OR, MVT::v2i32, 3}, 2505 {ISD::OR, MVT::v4i32, 5}, 2506 {ISD::OR, MVT::v2i64, 3}, 2507 {ISD::XOR, MVT::v8i8, 15}, 2508 {ISD::XOR, MVT::v16i8, 17}, 2509 {ISD::XOR, MVT::v4i16, 7}, 2510 {ISD::XOR, MVT::v8i16, 9}, 2511 {ISD::XOR, MVT::v2i32, 3}, 2512 {ISD::XOR, MVT::v4i32, 5}, 2513 {ISD::XOR, MVT::v2i64, 3}, 2514 {ISD::AND, MVT::v8i8, 15}, 2515 {ISD::AND, MVT::v16i8, 17}, 2516 {ISD::AND, MVT::v4i16, 7}, 2517 {ISD::AND, MVT::v8i16, 9}, 2518 {ISD::AND, MVT::v2i32, 3}, 2519 {ISD::AND, MVT::v4i32, 5}, 2520 {ISD::AND, MVT::v2i64, 3}, 2521 }; 2522 switch (ISD) { 2523 default: 2524 break; 2525 case ISD::ADD: 2526 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) 2527 return (LT.first - 1) + Entry->Cost; 2528 break; 2529 case ISD::XOR: 2530 case ISD::AND: 2531 case ISD::OR: 2532 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy); 2533 if (!Entry) 2534 break; 2535 auto *ValVTy = cast<FixedVectorType>(ValTy); 2536 if (!ValVTy->getElementType()->isIntegerTy(1) && 2537 MTy.getVectorNumElements() <= ValVTy->getNumElements() && 2538 isPowerOf2_32(ValVTy->getNumElements())) { 2539 InstructionCost ExtraCost = 0; 2540 if (LT.first != 1) { 2541 // Type needs to be split, so there is an extra cost of LT.first - 1 2542 // arithmetic ops. 2543 auto *Ty = FixedVectorType::get(ValTy->getElementType(), 2544 MTy.getVectorNumElements()); 2545 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); 2546 ExtraCost *= LT.first - 1; 2547 } 2548 return Entry->Cost + ExtraCost; 2549 } 2550 break; 2551 } 2552 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 2553 } 2554 2555 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { 2556 static const CostTblEntry ShuffleTbl[] = { 2557 { TTI::SK_Splice, MVT::nxv16i8, 1 }, 2558 { TTI::SK_Splice, MVT::nxv8i16, 1 }, 2559 { TTI::SK_Splice, MVT::nxv4i32, 1 }, 2560 { TTI::SK_Splice, MVT::nxv2i64, 1 }, 2561 { TTI::SK_Splice, MVT::nxv2f16, 1 }, 2562 { TTI::SK_Splice, MVT::nxv4f16, 1 }, 2563 { TTI::SK_Splice, MVT::nxv8f16, 1 }, 2564 { TTI::SK_Splice, MVT::nxv2bf16, 1 }, 2565 { TTI::SK_Splice, MVT::nxv4bf16, 1 }, 2566 { TTI::SK_Splice, MVT::nxv8bf16, 1 }, 2567 { TTI::SK_Splice, MVT::nxv2f32, 1 }, 2568 { TTI::SK_Splice, MVT::nxv4f32, 1 }, 2569 { TTI::SK_Splice, MVT::nxv2f64, 1 }, 2570 }; 2571 2572 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 2573 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext()); 2574 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2575 EVT PromotedVT = LT.second.getScalarType() == MVT::i1 2576 ? 
TLI->getPromotedVTForPredicate(EVT(LT.second)) 2577 : LT.second; 2578 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext()); 2579 InstructionCost LegalizationCost = 0; 2580 if (Index < 0) { 2581 LegalizationCost = 2582 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy, 2583 CmpInst::BAD_ICMP_PREDICATE, CostKind) + 2584 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy, 2585 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2586 } 2587 2588 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp 2589 // Cost performed on a promoted type. 2590 if (LT.second.getScalarType() == MVT::i1) { 2591 LegalizationCost += 2592 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy, 2593 TTI::CastContextHint::None, CostKind) + 2594 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy, 2595 TTI::CastContextHint::None, CostKind); 2596 } 2597 const auto *Entry = 2598 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT()); 2599 assert(Entry && "Illegal Type for Splice"); 2600 LegalizationCost += Entry->Cost; 2601 return LegalizationCost * LT.first; 2602 } 2603 2604 InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, 2605 VectorType *Tp, 2606 ArrayRef<int> Mask, int Index, 2607 VectorType *SubTp, 2608 ArrayRef<Value *> Args) { 2609 Kind = improveShuffleKindFromMask(Kind, Mask); 2610 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || 2611 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || 2612 Kind == TTI::SK_Reverse) { 2613 static const CostTblEntry ShuffleTbl[] = { 2614 // Broadcast shuffle kinds can be performed with 'dup'. 2615 { TTI::SK_Broadcast, MVT::v8i8, 1 }, 2616 { TTI::SK_Broadcast, MVT::v16i8, 1 }, 2617 { TTI::SK_Broadcast, MVT::v4i16, 1 }, 2618 { TTI::SK_Broadcast, MVT::v8i16, 1 }, 2619 { TTI::SK_Broadcast, MVT::v2i32, 1 }, 2620 { TTI::SK_Broadcast, MVT::v4i32, 1 }, 2621 { TTI::SK_Broadcast, MVT::v2i64, 1 }, 2622 { TTI::SK_Broadcast, MVT::v2f32, 1 }, 2623 { TTI::SK_Broadcast, MVT::v4f32, 1 }, 2624 { TTI::SK_Broadcast, MVT::v2f64, 1 }, 2625 // Transpose shuffle kinds can be performed with 'trn1/trn2' and 2626 // 'zip1/zip2' instructions. 2627 { TTI::SK_Transpose, MVT::v8i8, 1 }, 2628 { TTI::SK_Transpose, MVT::v16i8, 1 }, 2629 { TTI::SK_Transpose, MVT::v4i16, 1 }, 2630 { TTI::SK_Transpose, MVT::v8i16, 1 }, 2631 { TTI::SK_Transpose, MVT::v2i32, 1 }, 2632 { TTI::SK_Transpose, MVT::v4i32, 1 }, 2633 { TTI::SK_Transpose, MVT::v2i64, 1 }, 2634 { TTI::SK_Transpose, MVT::v2f32, 1 }, 2635 { TTI::SK_Transpose, MVT::v4f32, 1 }, 2636 { TTI::SK_Transpose, MVT::v2f64, 1 }, 2637 // Select shuffle kinds. 2638 // TODO: handle vXi8/vXi16. 2639 { TTI::SK_Select, MVT::v2i32, 1 }, // mov. 2640 { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar). 2641 { TTI::SK_Select, MVT::v2i64, 1 }, // mov. 2642 { TTI::SK_Select, MVT::v2f32, 1 }, // mov. 2643 { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar). 2644 { TTI::SK_Select, MVT::v2f64, 1 }, // mov. 2645 // PermuteSingleSrc shuffle kinds. 2646 { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov. 2647 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case. 2648 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov. 2649 { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov. 2650 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case. 2651 { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov. 2652 { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case. 
2653 { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case. 2654 { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case. 2655 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl 2656 { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl 2657 { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl 2658 { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl 2659 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl 2660 // Reverse can be lowered with `rev`. 2661 { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov. 2662 { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT 2663 { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov. 2664 { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov. 2665 { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT 2666 { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov. 2667 // Broadcast shuffle kinds for scalable vectors 2668 { TTI::SK_Broadcast, MVT::nxv16i8, 1 }, 2669 { TTI::SK_Broadcast, MVT::nxv8i16, 1 }, 2670 { TTI::SK_Broadcast, MVT::nxv4i32, 1 }, 2671 { TTI::SK_Broadcast, MVT::nxv2i64, 1 }, 2672 { TTI::SK_Broadcast, MVT::nxv2f16, 1 }, 2673 { TTI::SK_Broadcast, MVT::nxv4f16, 1 }, 2674 { TTI::SK_Broadcast, MVT::nxv8f16, 1 }, 2675 { TTI::SK_Broadcast, MVT::nxv2bf16, 1 }, 2676 { TTI::SK_Broadcast, MVT::nxv4bf16, 1 }, 2677 { TTI::SK_Broadcast, MVT::nxv8bf16, 1 }, 2678 { TTI::SK_Broadcast, MVT::nxv2f32, 1 }, 2679 { TTI::SK_Broadcast, MVT::nxv4f32, 1 }, 2680 { TTI::SK_Broadcast, MVT::nxv2f64, 1 }, 2681 { TTI::SK_Broadcast, MVT::nxv16i1, 1 }, 2682 { TTI::SK_Broadcast, MVT::nxv8i1, 1 }, 2683 { TTI::SK_Broadcast, MVT::nxv4i1, 1 }, 2684 { TTI::SK_Broadcast, MVT::nxv2i1, 1 }, 2685 // Handle the cases for vector.reverse with scalable vectors 2686 { TTI::SK_Reverse, MVT::nxv16i8, 1 }, 2687 { TTI::SK_Reverse, MVT::nxv8i16, 1 }, 2688 { TTI::SK_Reverse, MVT::nxv4i32, 1 }, 2689 { TTI::SK_Reverse, MVT::nxv2i64, 1 }, 2690 { TTI::SK_Reverse, MVT::nxv2f16, 1 }, 2691 { TTI::SK_Reverse, MVT::nxv4f16, 1 }, 2692 { TTI::SK_Reverse, MVT::nxv8f16, 1 }, 2693 { TTI::SK_Reverse, MVT::nxv2bf16, 1 }, 2694 { TTI::SK_Reverse, MVT::nxv4bf16, 1 }, 2695 { TTI::SK_Reverse, MVT::nxv8bf16, 1 }, 2696 { TTI::SK_Reverse, MVT::nxv2f32, 1 }, 2697 { TTI::SK_Reverse, MVT::nxv4f32, 1 }, 2698 { TTI::SK_Reverse, MVT::nxv2f64, 1 }, 2699 { TTI::SK_Reverse, MVT::nxv16i1, 1 }, 2700 { TTI::SK_Reverse, MVT::nxv8i1, 1 }, 2701 { TTI::SK_Reverse, MVT::nxv4i1, 1 }, 2702 { TTI::SK_Reverse, MVT::nxv2i1, 1 }, 2703 }; 2704 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 2705 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) 2706 return LT.first * Entry->Cost; 2707 } 2708 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp)) 2709 return getSpliceCost(Tp, Index); 2710 return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); 2711 } 2712