1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "ARMTargetTransformInfo.h" 10 #include "ARMSubtarget.h" 11 #include "MCTargetDesc/ARMAddressingModes.h" 12 #include "llvm/ADT/APInt.h" 13 #include "llvm/ADT/SmallVector.h" 14 #include "llvm/Analysis/LoopInfo.h" 15 #include "llvm/CodeGen/CostTable.h" 16 #include "llvm/CodeGen/ISDOpcodes.h" 17 #include "llvm/CodeGen/ValueTypes.h" 18 #include "llvm/IR/BasicBlock.h" 19 #include "llvm/IR/DataLayout.h" 20 #include "llvm/IR/DerivedTypes.h" 21 #include "llvm/IR/Instruction.h" 22 #include "llvm/IR/Instructions.h" 23 #include "llvm/IR/IntrinsicInst.h" 24 #include "llvm/IR/IntrinsicsARM.h" 25 #include "llvm/IR/PatternMatch.h" 26 #include "llvm/IR/Type.h" 27 #include "llvm/MC/SubtargetFeature.h" 28 #include "llvm/Support/Casting.h" 29 #include "llvm/Support/MachineValueType.h" 30 #include "llvm/Target/TargetMachine.h" 31 #include <algorithm> 32 #include <cassert> 33 #include <cstdint> 34 #include <utility> 35 36 using namespace llvm; 37 38 #define DEBUG_TYPE "armtti" 39 40 static cl::opt<bool> EnableMaskedLoadStores( 41 "enable-arm-maskedldst", cl::Hidden, cl::init(true), 42 cl::desc("Enable the generation of masked loads and stores")); 43 44 static cl::opt<bool> DisableLowOverheadLoops( 45 "disable-arm-loloops", cl::Hidden, cl::init(false), 46 cl::desc("Disable the generation of low-overhead loops")); 47 48 extern cl::opt<bool> DisableTailPredication; 49 50 extern cl::opt<bool> EnableMaskedGatherScatters; 51 52 bool ARMTTIImpl::areInlineCompatible(const Function *Caller, 53 const Function *Callee) const { 54 const TargetMachine &TM = getTLI()->getTargetMachine(); 55 const FeatureBitset &CallerBits = 56 TM.getSubtargetImpl(*Caller)->getFeatureBits(); 57 const FeatureBitset &CalleeBits = 58 TM.getSubtargetImpl(*Callee)->getFeatureBits(); 59 60 // To inline a callee, all features not in the allowed list must match exactly. 61 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) == 62 (CalleeBits & ~InlineFeaturesAllowed); 63 // For features in the allowed list, the callee's features must be a subset of 64 // the callers'. 65 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) == 66 (CalleeBits & InlineFeaturesAllowed); 67 return MatchExact && MatchSubset; 68 } 69 70 bool ARMTTIImpl::shouldFavorBackedgeIndex(const Loop *L) const { 71 if (L->getHeader()->getParent()->hasOptSize()) 72 return false; 73 if (ST->hasMVEIntegerOps()) 74 return false; 75 return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1; 76 } 77 78 bool ARMTTIImpl::shouldFavorPostInc() const { 79 if (ST->hasMVEIntegerOps()) 80 return true; 81 return false; 82 } 83 84 int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 85 TTI::TargetCostKind CostKind) { 86 assert(Ty->isIntegerTy()); 87 88 unsigned Bits = Ty->getPrimitiveSizeInBits(); 89 if (Bits == 0 || Imm.getActiveBits() >= 64) 90 return 4; 91 92 int64_t SImmVal = Imm.getSExtValue(); 93 uint64_t ZImmVal = Imm.getZExtValue(); 94 if (!ST->isThumb()) { 95 if ((SImmVal >= 0 && SImmVal < 65536) || 96 (ARM_AM::getSOImmVal(ZImmVal) != -1) || 97 (ARM_AM::getSOImmVal(~ZImmVal) != -1)) 98 return 1; 99 return ST->hasV6T2Ops() ? 2 : 3; 100 } 101 if (ST->isThumb2()) { 102 if ((SImmVal >= 0 && SImmVal < 65536) || 103 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) || 104 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1)) 105 return 1; 106 return ST->hasV6T2Ops() ? 2 : 3; 107 } 108 // Thumb1, any i8 imm cost 1. 109 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256)) 110 return 1; 111 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal)) 112 return 2; 113 // Load from constantpool. 114 return 3; 115 } 116 117 // Constants smaller than 256 fit in the immediate field of 118 // Thumb1 instructions so we return a zero cost and 1 otherwise. 119 int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, 120 const APInt &Imm, Type *Ty) { 121 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256) 122 return 0; 123 124 return 1; 125 } 126 127 int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, 128 Type *Ty, TTI::TargetCostKind CostKind) { 129 // Division by a constant can be turned into multiplication, but only if we 130 // know it's constant. So it's not so much that the immediate is cheap (it's 131 // not), but that the alternative is worse. 132 // FIXME: this is probably unneeded with GlobalISel. 133 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv || 134 Opcode == Instruction::SRem || Opcode == Instruction::URem) && 135 Idx == 1) 136 return 0; 137 138 if (Opcode == Instruction::And) { 139 // UXTB/UXTH 140 if (Imm == 255 || Imm == 65535) 141 return 0; 142 // Conversion to BIC is free, and means we can use ~Imm instead. 143 return std::min(getIntImmCost(Imm, Ty, CostKind), 144 getIntImmCost(~Imm, Ty, CostKind)); 145 } 146 147 if (Opcode == Instruction::Add) 148 // Conversion to SUB is free, and means we can use -Imm instead. 149 return std::min(getIntImmCost(Imm, Ty, CostKind), 150 getIntImmCost(-Imm, Ty, CostKind)); 151 152 if (Opcode == Instruction::ICmp && Imm.isNegative() && 153 Ty->getIntegerBitWidth() == 32) { 154 int64_t NegImm = -Imm.getSExtValue(); 155 if (ST->isThumb2() && NegImm < 1<<12) 156 // icmp X, #-C -> cmn X, #C 157 return 0; 158 if (ST->isThumb() && NegImm < 1<<8) 159 // icmp X, #-C -> adds X, #C 160 return 0; 161 } 162 163 // xor a, -1 can always be folded to MVN 164 if (Opcode == Instruction::Xor && Imm.isAllOnesValue()) 165 return 0; 166 167 return getIntImmCost(Imm, Ty, CostKind); 168 } 169 170 int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, 171 TTI::TargetCostKind CostKind, 172 const Instruction *I) { 173 int ISD = TLI->InstructionOpcodeToISD(Opcode); 174 assert(ISD && "Invalid opcode"); 175 176 // TODO: Allow non-throughput costs that aren't binary. 177 auto AdjustCost = [&CostKind](int Cost) { 178 if (CostKind != TTI::TCK_RecipThroughput) 179 return Cost == 0 ? 0 : 1; 180 return Cost; 181 }; 182 183 EVT SrcTy = TLI->getValueType(DL, Src); 184 EVT DstTy = TLI->getValueType(DL, Dst); 185 186 if (!SrcTy.isSimple() || !DstTy.isSimple()) 187 return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I)); 188 189 // The extend of a load is free 190 if (I && isa<LoadInst>(I->getOperand(0))) { 191 static const TypeConversionCostTblEntry LoadConversionTbl[] = { 192 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0}, 193 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0}, 194 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0}, 195 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0}, 196 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0}, 197 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0}, 198 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1}, 199 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1}, 200 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1}, 201 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1}, 202 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1}, 203 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1}, 204 }; 205 if (const auto *Entry = ConvertCostTableLookup( 206 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 207 return AdjustCost(Entry->Cost); 208 209 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = { 210 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0}, 211 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0}, 212 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0}, 213 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0}, 214 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0}, 215 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0}, 216 // The following extend from a legal type to an illegal type, so need to 217 // split the load. This introduced an extra load operation, but the 218 // extend is still "free". 219 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1}, 220 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1}, 221 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3}, 222 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3}, 223 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1}, 224 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1}, 225 }; 226 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { 227 if (const auto *Entry = 228 ConvertCostTableLookup(MVELoadConversionTbl, ISD, 229 DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 230 return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); 231 } 232 233 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = { 234 // FPExtends are similar but also require the VCVT instructions. 235 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, 236 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3}, 237 }; 238 if (SrcTy.isVector() && ST->hasMVEFloatOps()) { 239 if (const auto *Entry = 240 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD, 241 DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 242 return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); 243 } 244 } 245 246 // The truncate of a store is free. This is the mirror of extends above. 247 if (I && I->hasOneUse() && isa<StoreInst>(*I->user_begin())) { 248 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = { 249 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0}, 250 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0}, 251 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0}, 252 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1}, 253 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3}, 254 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1}, 255 }; 256 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { 257 if (const auto *Entry = 258 ConvertCostTableLookup(MVELoadConversionTbl, ISD, SrcTy.getSimpleVT(), 259 DstTy.getSimpleVT())) 260 return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); 261 } 262 263 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = { 264 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1}, 265 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3}, 266 }; 267 if (SrcTy.isVector() && ST->hasMVEFloatOps()) { 268 if (const auto *Entry = 269 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD, SrcTy.getSimpleVT(), 270 DstTy.getSimpleVT())) 271 return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); 272 } 273 } 274 275 // NEON vector operations that can extend their inputs. 276 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) && 277 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) { 278 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = { 279 // vaddl 280 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 }, 281 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 }, 282 // vsubl 283 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 }, 284 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 }, 285 // vmull 286 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 }, 287 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 }, 288 // vshll 289 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 }, 290 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 }, 291 }; 292 293 auto *User = cast<Instruction>(*I->user_begin()); 294 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode()); 295 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD, 296 DstTy.getSimpleVT(), 297 SrcTy.getSimpleVT())) { 298 return AdjustCost(Entry->Cost); 299 } 300 } 301 302 // Single to/from double precision conversions. 303 if (Src->isVectorTy() && ST->hasNEON() && 304 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 && 305 DstTy.getScalarType() == MVT::f32) || 306 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 && 307 DstTy.getScalarType() == MVT::f64))) { 308 static const CostTblEntry NEONFltDblTbl[] = { 309 // Vector fptrunc/fpext conversions. 310 {ISD::FP_ROUND, MVT::v2f64, 2}, 311 {ISD::FP_EXTEND, MVT::v2f32, 2}, 312 {ISD::FP_EXTEND, MVT::v4f32, 4}}; 313 314 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); 315 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second)) 316 return AdjustCost(LT.first * Entry->Cost); 317 } 318 319 // Some arithmetic, load and store operations have specific instructions 320 // to cast up/down their types automatically at no extra cost. 321 // TODO: Get these tables to know at least what the related operations are. 322 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = { 323 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 324 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 325 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, 326 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, 327 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, 328 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, 329 330 // The number of vmovl instructions for the extension. 331 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 332 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 333 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, 334 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, 335 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 }, 336 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 }, 337 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, 338 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, 339 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 340 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 341 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 342 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 343 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 344 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 345 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 346 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 347 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 348 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 349 350 // Operations that we legalize using splitting. 351 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, 352 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, 353 354 // Vector float <-> i32 conversions. 355 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 356 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 357 358 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 359 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 360 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, 361 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, 362 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 363 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 364 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, 365 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, 366 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 367 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 368 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 369 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 370 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 371 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 372 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, 373 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, 374 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 }, 375 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 }, 376 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 }, 377 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 }, 378 379 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 380 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 381 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 }, 382 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 }, 383 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, 384 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, 385 386 // Vector double <-> i32 conversions. 387 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 388 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 389 390 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 391 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 392 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, 393 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, 394 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 395 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 396 397 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, 398 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, 399 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 }, 400 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 }, 401 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 }, 402 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 } 403 }; 404 405 if (SrcTy.isVector() && ST->hasNEON()) { 406 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD, 407 DstTy.getSimpleVT(), 408 SrcTy.getSimpleVT())) 409 return AdjustCost(Entry->Cost); 410 } 411 412 // Scalar float to integer conversions. 413 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = { 414 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 }, 415 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 }, 416 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 }, 417 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 }, 418 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 }, 419 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 }, 420 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 }, 421 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 }, 422 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 }, 423 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 }, 424 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 }, 425 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 }, 426 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 }, 427 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 }, 428 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 }, 429 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 }, 430 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 }, 431 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 }, 432 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 }, 433 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 } 434 }; 435 if (SrcTy.isFloatingPoint() && ST->hasNEON()) { 436 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD, 437 DstTy.getSimpleVT(), 438 SrcTy.getSimpleVT())) 439 return AdjustCost(Entry->Cost); 440 } 441 442 // Scalar integer to float conversions. 443 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = { 444 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 }, 445 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 }, 446 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 }, 447 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 }, 448 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 }, 449 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 }, 450 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 }, 451 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 }, 452 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 }, 453 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 }, 454 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 }, 455 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 }, 456 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 }, 457 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 }, 458 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 }, 459 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 }, 460 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 }, 461 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 }, 462 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 }, 463 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 } 464 }; 465 466 if (SrcTy.isInteger() && ST->hasNEON()) { 467 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl, 468 ISD, DstTy.getSimpleVT(), 469 SrcTy.getSimpleVT())) 470 return AdjustCost(Entry->Cost); 471 } 472 473 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one 474 // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext 475 // are linearised so take more. 476 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = { 477 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 478 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 479 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, 480 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, 481 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 }, 482 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 }, 483 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 484 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 485 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 }, 486 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, 487 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 }, 488 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 }, 489 }; 490 491 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { 492 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl, 493 ISD, DstTy.getSimpleVT(), 494 SrcTy.getSimpleVT())) 495 return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); 496 } 497 498 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) { 499 // As general rule, fp converts that were not matched above are scalarized 500 // and cost 1 vcvt for each lane, so long as the instruction is available. 501 // If not it will become a series of function calls. 502 const int CallCost = getCallInstrCost(nullptr, Dst, {Src}, CostKind); 503 int Lanes = 1; 504 if (SrcTy.isFixedLengthVector()) 505 Lanes = SrcTy.getVectorNumElements(); 506 auto IsLegal = [this](EVT VT) { 507 EVT EltVT = VT.getScalarType(); 508 return (EltVT == MVT::f32 && ST->hasVFP2Base()) || 509 (EltVT == MVT::f64 && ST->hasFP64()) || 510 (EltVT == MVT::f16 && ST->hasFullFP16()); 511 }; 512 513 if (IsLegal(SrcTy) && IsLegal(DstTy)) 514 return Lanes; 515 else 516 return Lanes * CallCost; 517 } 518 519 // Scalar integer conversion costs. 520 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = { 521 // i16 -> i64 requires two dependent operations. 522 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 }, 523 524 // Truncates on i64 are assumed to be free. 525 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 }, 526 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 }, 527 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 }, 528 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 } 529 }; 530 531 if (SrcTy.isInteger()) { 532 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD, 533 DstTy.getSimpleVT(), 534 SrcTy.getSimpleVT())) 535 return AdjustCost(Entry->Cost); 536 } 537 538 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() 539 ? ST->getMVEVectorCostFactor() 540 : 1; 541 return AdjustCost( 542 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I)); 543 } 544 545 int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, 546 unsigned Index) { 547 // Penalize inserting into an D-subregister. We end up with a three times 548 // lower estimated throughput on swift. 549 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement && 550 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32) 551 return 3; 552 553 if (ST->hasNEON() && (Opcode == Instruction::InsertElement || 554 Opcode == Instruction::ExtractElement)) { 555 // Cross-class copies are expensive on many microarchitectures, 556 // so assume they are expensive by default. 557 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy()) 558 return 3; 559 560 // Even if it's not a cross class copy, this likely leads to mixing 561 // of NEON and VFP code and should be therefore penalized. 562 if (ValTy->isVectorTy() && 563 ValTy->getScalarSizeInBits() <= 32) 564 return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U); 565 } 566 567 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement || 568 Opcode == Instruction::ExtractElement)) { 569 // We say MVE moves costs at least the MVEVectorCostFactor, even though 570 // they are scalar instructions. This helps prevent mixing scalar and 571 // vector, to prevent vectorising where we end up just scalarising the 572 // result anyway. 573 return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 574 ST->getMVEVectorCostFactor()) * 575 cast<FixedVectorType>(ValTy)->getNumElements() / 2; 576 } 577 578 return BaseT::getVectorInstrCost(Opcode, ValTy, Index); 579 } 580 581 int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, 582 TTI::TargetCostKind CostKind, 583 const Instruction *I) { 584 // TODO: Handle other cost kinds. 585 if (CostKind != TTI::TCK_RecipThroughput) 586 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I); 587 588 int ISD = TLI->InstructionOpcodeToISD(Opcode); 589 // On NEON a vector select gets lowered to vbsl. 590 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) { 591 // Lowering of some vector selects is currently far from perfect. 592 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = { 593 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 }, 594 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 }, 595 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 } 596 }; 597 598 EVT SelCondTy = TLI->getValueType(DL, CondTy); 599 EVT SelValTy = TLI->getValueType(DL, ValTy); 600 if (SelCondTy.isSimple() && SelValTy.isSimple()) { 601 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD, 602 SelCondTy.getSimpleVT(), 603 SelValTy.getSimpleVT())) 604 return Entry->Cost; 605 } 606 607 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 608 return LT.first; 609 } 610 611 int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy() 612 ? ST->getMVEVectorCostFactor() 613 : 1; 614 return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, 615 I); 616 } 617 618 int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, 619 const SCEV *Ptr) { 620 // Address computations in vectorized code with non-consecutive addresses will 621 // likely result in more instructions compared to scalar code where the 622 // computation can more often be merged into the index mode. The resulting 623 // extra micro-ops can significantly decrease throughput. 624 unsigned NumVectorInstToHideOverhead = 10; 625 int MaxMergeDistance = 64; 626 627 if (ST->hasNEON()) { 628 if (Ty->isVectorTy() && SE && 629 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) 630 return NumVectorInstToHideOverhead; 631 632 // In many cases the address computation is not merged into the instruction 633 // addressing mode. 634 return 1; 635 } 636 return BaseT::getAddressComputationCost(Ty, SE, Ptr); 637 } 638 639 bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) { 640 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { 641 // If a VCTP is part of a chain, it's already profitable and shouldn't be 642 // optimized, else LSR may block tail-predication. 643 switch (II->getIntrinsicID()) { 644 case Intrinsic::arm_mve_vctp8: 645 case Intrinsic::arm_mve_vctp16: 646 case Intrinsic::arm_mve_vctp32: 647 case Intrinsic::arm_mve_vctp64: 648 return true; 649 default: 650 break; 651 } 652 } 653 return false; 654 } 655 656 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { 657 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps()) 658 return false; 659 660 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) { 661 // Don't support v2i1 yet. 662 if (VecTy->getNumElements() == 2) 663 return false; 664 665 // We don't support extending fp types. 666 unsigned VecWidth = DataTy->getPrimitiveSizeInBits(); 667 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy()) 668 return false; 669 } 670 671 unsigned EltWidth = DataTy->getScalarSizeInBits(); 672 return (EltWidth == 32 && Alignment >= 4) || 673 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8); 674 } 675 676 bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) { 677 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps()) 678 return false; 679 680 // This method is called in 2 places: 681 // - from the vectorizer with a scalar type, in which case we need to get 682 // this as good as we can with the limited info we have (and rely on the cost 683 // model for the rest). 684 // - from the masked intrinsic lowering pass with the actual vector type. 685 // For MVE, we have a custom lowering pass that will already have custom 686 // legalised any gathers that we can to MVE intrinsics, and want to expand all 687 // the rest. The pass runs before the masked intrinsic lowering pass, so if we 688 // are here, we know we want to expand. 689 if (isa<VectorType>(Ty)) 690 return false; 691 692 unsigned EltWidth = Ty->getScalarSizeInBits(); 693 return ((EltWidth == 32 && Alignment >= 4) || 694 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8); 695 } 696 697 int ARMTTIImpl::getMemcpyCost(const Instruction *I) { 698 const MemCpyInst *MI = dyn_cast<MemCpyInst>(I); 699 assert(MI && "MemcpyInst expected"); 700 ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength()); 701 702 // To model the cost of a library call, we assume 1 for the call, and 703 // 3 for the argument setup. 704 const unsigned LibCallCost = 4; 705 706 // If 'size' is not a constant, a library call will be generated. 707 if (!C) 708 return LibCallCost; 709 710 const unsigned Size = C->getValue().getZExtValue(); 711 const Align DstAlign = *MI->getDestAlign(); 712 const Align SrcAlign = *MI->getSourceAlign(); 713 const Function *F = I->getParent()->getParent(); 714 const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize()); 715 std::vector<EVT> MemOps; 716 717 // MemOps will be poplulated with a list of data types that needs to be 718 // loaded and stored. That's why we multiply the number of elements by 2 to 719 // get the cost for this memcpy. 720 if (getTLI()->findOptimalMemOpLowering( 721 MemOps, Limit, 722 MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign, 723 /*IsVolatile*/ true), 724 MI->getDestAddressSpace(), MI->getSourceAddressSpace(), 725 F->getAttributes())) 726 return MemOps.size() * 2; 727 728 // If we can't find an optimal memop lowering, return the default cost 729 return LibCallCost; 730 } 731 732 int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, 733 int Index, VectorType *SubTp) { 734 if (ST->hasNEON()) { 735 if (Kind == TTI::SK_Broadcast) { 736 static const CostTblEntry NEONDupTbl[] = { 737 // VDUP handles these cases. 738 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, 739 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, 740 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, 741 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, 742 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, 743 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, 744 745 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, 746 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, 747 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, 748 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}}; 749 750 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 751 752 if (const auto *Entry = 753 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second)) 754 return LT.first * Entry->Cost; 755 } 756 if (Kind == TTI::SK_Reverse) { 757 static const CostTblEntry NEONShuffleTbl[] = { 758 // Reverse shuffle cost one instruction if we are shuffling within a 759 // double word (vrev) or two if we shuffle a quad word (vrev, vext). 760 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, 761 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, 762 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, 763 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, 764 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, 765 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, 766 767 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, 768 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, 769 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, 770 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; 771 772 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 773 774 if (const auto *Entry = 775 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) 776 return LT.first * Entry->Cost; 777 } 778 if (Kind == TTI::SK_Select) { 779 static const CostTblEntry NEONSelShuffleTbl[] = { 780 // Select shuffle cost table for ARM. Cost is the number of 781 // instructions 782 // required to create the shuffled vector. 783 784 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, 785 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, 786 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, 787 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, 788 789 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, 790 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, 791 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2}, 792 793 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16}, 794 795 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; 796 797 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 798 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl, 799 ISD::VECTOR_SHUFFLE, LT.second)) 800 return LT.first * Entry->Cost; 801 } 802 } 803 if (ST->hasMVEIntegerOps()) { 804 if (Kind == TTI::SK_Broadcast) { 805 static const CostTblEntry MVEDupTbl[] = { 806 // VDUP handles these cases. 807 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, 808 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, 809 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}, 810 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, 811 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}}; 812 813 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 814 815 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE, 816 LT.second)) 817 return LT.first * Entry->Cost * ST->getMVEVectorCostFactor(); 818 } 819 } 820 int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy() 821 ? ST->getMVEVectorCostFactor() 822 : 1; 823 return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp); 824 } 825 826 int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, 827 TTI::TargetCostKind CostKind, 828 TTI::OperandValueKind Op1Info, 829 TTI::OperandValueKind Op2Info, 830 TTI::OperandValueProperties Opd1PropInfo, 831 TTI::OperandValueProperties Opd2PropInfo, 832 ArrayRef<const Value *> Args, 833 const Instruction *CxtI) { 834 // TODO: Handle more cost kinds. 835 if (CostKind != TTI::TCK_RecipThroughput) 836 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, 837 Op2Info, Opd1PropInfo, 838 Opd2PropInfo, Args, CxtI); 839 840 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); 841 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 842 843 if (ST->hasNEON()) { 844 const unsigned FunctionCallDivCost = 20; 845 const unsigned ReciprocalDivCost = 10; 846 static const CostTblEntry CostTbl[] = { 847 // Division. 848 // These costs are somewhat random. Choose a cost of 20 to indicate that 849 // vectorizing devision (added function call) is going to be very expensive. 850 // Double registers types. 851 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost}, 852 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost}, 853 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost}, 854 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost}, 855 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost}, 856 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost}, 857 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost}, 858 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost}, 859 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost}, 860 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost}, 861 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost}, 862 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost}, 863 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost}, 864 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost}, 865 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost}, 866 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost}, 867 // Quad register types. 868 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost}, 869 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost}, 870 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost}, 871 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost}, 872 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost}, 873 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost}, 874 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost}, 875 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost}, 876 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost}, 877 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost}, 878 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost}, 879 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost}, 880 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost}, 881 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost}, 882 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost}, 883 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost}, 884 // Multiplication. 885 }; 886 887 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second)) 888 return LT.first * Entry->Cost; 889 890 int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, 891 Op2Info, 892 Opd1PropInfo, Opd2PropInfo); 893 894 // This is somewhat of a hack. The problem that we are facing is that SROA 895 // creates a sequence of shift, and, or instructions to construct values. 896 // These sequences are recognized by the ISel and have zero-cost. Not so for 897 // the vectorized code. Because we have support for v2i64 but not i64 those 898 // sequences look particularly beneficial to vectorize. 899 // To work around this we increase the cost of v2i64 operations to make them 900 // seem less beneficial. 901 if (LT.second == MVT::v2i64 && 902 Op2Info == TargetTransformInfo::OK_UniformConstantValue) 903 Cost += 4; 904 905 return Cost; 906 } 907 908 // If this operation is a shift on arm/thumb2, it might well be folded into 909 // the following instruction, hence having a cost of 0. 910 auto LooksLikeAFreeShift = [&]() { 911 if (ST->isThumb1Only() || Ty->isVectorTy()) 912 return false; 913 914 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift()) 915 return false; 916 if (Op2Info != TargetTransformInfo::OK_UniformConstantValue) 917 return false; 918 919 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB 920 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) { 921 case Instruction::Add: 922 case Instruction::Sub: 923 case Instruction::And: 924 case Instruction::Xor: 925 case Instruction::Or: 926 case Instruction::ICmp: 927 return true; 928 default: 929 return false; 930 } 931 }; 932 if (LooksLikeAFreeShift()) 933 return 0; 934 935 int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy() 936 ? ST->getMVEVectorCostFactor() 937 : 1; 938 939 // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost, 940 // without treating floats as more expensive that scalars or increasing the 941 // costs for custom operations. The results is also multiplied by the 942 // MVEVectorCostFactor where appropriate. 943 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second)) 944 return LT.first * BaseCost; 945 946 // Else this is expand, assume that we need to scalarize this op. 947 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { 948 unsigned Num = VTy->getNumElements(); 949 unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType(), 950 CostKind); 951 // Return the cost of multiple scalar invocation plus the cost of 952 // inserting and extracting the values. 953 return BaseT::getScalarizationOverhead(VTy, Args) + Num * Cost; 954 } 955 956 return BaseCost; 957 } 958 959 int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, 960 MaybeAlign Alignment, unsigned AddressSpace, 961 TTI::TargetCostKind CostKind, 962 const Instruction *I) { 963 // TODO: Handle other cost kinds. 964 if (CostKind != TTI::TCK_RecipThroughput) 965 return 1; 966 967 // Type legalization can't handle structs 968 if (TLI->getValueType(DL, Src, true) == MVT::Other) 969 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 970 CostKind); 971 972 if (ST->hasNEON() && Src->isVectorTy() && 973 (Alignment && *Alignment != Align(16)) && 974 cast<VectorType>(Src)->getElementType()->isDoubleTy()) { 975 // Unaligned loads/stores are extremely inefficient. 976 // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr. 977 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); 978 return LT.first * 4; 979 } 980 981 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load. 982 // Same for stores. 983 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I && 984 ((Opcode == Instruction::Load && I->hasOneUse() && 985 isa<FPExtInst>(*I->user_begin())) || 986 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) { 987 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src); 988 Type *DstTy = 989 Opcode == Instruction::Load 990 ? (*I->user_begin())->getType() 991 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType(); 992 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() && 993 DstTy->getScalarType()->isFloatTy()) 994 return ST->getMVEVectorCostFactor(); 995 } 996 997 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() 998 ? ST->getMVEVectorCostFactor() 999 : 1; 1000 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 1001 CostKind, I); 1002 } 1003 1004 int ARMTTIImpl::getInterleavedMemoryOpCost( 1005 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 1006 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 1007 bool UseMaskForCond, bool UseMaskForGaps) { 1008 assert(Factor >= 2 && "Invalid interleave factor"); 1009 assert(isa<VectorType>(VecTy) && "Expect a vector type"); 1010 1011 // vldN/vstN doesn't support vector types of i64/f64 element. 1012 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; 1013 1014 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits && 1015 !UseMaskForCond && !UseMaskForGaps) { 1016 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements(); 1017 auto *SubVecTy = 1018 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); 1019 1020 // vldN/vstN only support legal vector types of size 64 or 128 in bits. 1021 // Accesses having vector types that are a multiple of 128 bits can be 1022 // matched to more than one vldN/vstN instruction. 1023 int BaseCost = ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor() : 1; 1024 if (NumElts % Factor == 0 && 1025 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, DL)) 1026 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL); 1027 1028 // Some smaller than legal interleaved patterns are cheap as we can make 1029 // use of the vmovn or vrev patterns to interleave a standard load. This is 1030 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is 1031 // promoted differently). The cost of 2 here is then a load and vrev or 1032 // vmovn. 1033 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 && 1034 VecTy->isIntOrIntVectorTy() && DL.getTypeSizeInBits(SubVecTy) <= 64) 1035 return 2 * BaseCost; 1036 } 1037 1038 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 1039 Alignment, AddressSpace, CostKind, 1040 UseMaskForCond, UseMaskForGaps); 1041 } 1042 1043 unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, 1044 const Value *Ptr, bool VariableMask, 1045 Align Alignment, 1046 TTI::TargetCostKind CostKind, 1047 const Instruction *I) { 1048 using namespace PatternMatch; 1049 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters) 1050 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 1051 Alignment, CostKind, I); 1052 1053 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!"); 1054 auto *VTy = cast<FixedVectorType>(DataTy); 1055 1056 // TODO: Splitting, once we do that. 1057 1058 unsigned NumElems = VTy->getNumElements(); 1059 unsigned EltSize = VTy->getScalarSizeInBits(); 1060 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy); 1061 1062 // For now, it is assumed that for the MVE gather instructions the loads are 1063 // all effectively serialised. This means the cost is the scalar cost 1064 // multiplied by the number of elements being loaded. This is possibly very 1065 // conservative, but even so we still end up vectorising loops because the 1066 // cost per iteration for many loops is lower than for scalar loops. 1067 unsigned VectorCost = NumElems * LT.first; 1068 // The scalarization cost should be a lot higher. We use the number of vector 1069 // elements plus the scalarization overhead. 1070 unsigned ScalarCost = 1071 NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, {}); 1072 1073 if (Alignment < EltSize / 8) 1074 return ScalarCost; 1075 1076 unsigned ExtSize = EltSize; 1077 // Check whether there's a single user that asks for an extended type 1078 if (I != nullptr) { 1079 // Dependent of the caller of this function, a gather instruction will 1080 // either have opcode Instruction::Load or be a call to the masked_gather 1081 // intrinsic 1082 if ((I->getOpcode() == Instruction::Load || 1083 match(I, m_Intrinsic<Intrinsic::masked_gather>())) && 1084 I->hasOneUse()) { 1085 const User *Us = *I->users().begin(); 1086 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) { 1087 // only allow valid type combinations 1088 unsigned TypeSize = 1089 cast<Instruction>(Us)->getType()->getScalarSizeInBits(); 1090 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) || 1091 (TypeSize == 16 && EltSize == 8)) && 1092 TypeSize * NumElems == 128) { 1093 ExtSize = TypeSize; 1094 } 1095 } 1096 } 1097 // Check whether the input data needs to be truncated 1098 TruncInst *T; 1099 if ((I->getOpcode() == Instruction::Store || 1100 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) && 1101 (T = dyn_cast<TruncInst>(I->getOperand(0)))) { 1102 // Only allow valid type combinations 1103 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits(); 1104 if (((EltSize == 16 && TypeSize == 32) || 1105 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) && 1106 TypeSize * NumElems == 128) 1107 ExtSize = TypeSize; 1108 } 1109 } 1110 1111 if (ExtSize * NumElems != 128 || NumElems < 4) 1112 return ScalarCost; 1113 1114 // Any (aligned) i32 gather will not need to be scalarised. 1115 if (ExtSize == 32) 1116 return VectorCost; 1117 // For smaller types, we need to ensure that the gep's inputs are correctly 1118 // extended from a small enough value. Other sizes (including i64) are 1119 // scalarized for now. 1120 if (ExtSize != 8 && ExtSize != 16) 1121 return ScalarCost; 1122 1123 if (const auto *BC = dyn_cast<BitCastInst>(Ptr)) 1124 Ptr = BC->getOperand(0); 1125 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) { 1126 if (GEP->getNumOperands() != 2) 1127 return ScalarCost; 1128 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType()); 1129 // Scale needs to be correct (which is only relevant for i16s). 1130 if (Scale != 1 && Scale * 8 != ExtSize) 1131 return ScalarCost; 1132 // And we need to zext (not sext) the indexes from a small enough type. 1133 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) { 1134 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize) 1135 return VectorCost; 1136 } 1137 return ScalarCost; 1138 } 1139 return ScalarCost; 1140 } 1141 1142 bool ARMTTIImpl::isLoweredToCall(const Function *F) { 1143 if (!F->isIntrinsic()) 1144 BaseT::isLoweredToCall(F); 1145 1146 // Assume all Arm-specific intrinsics map to an instruction. 1147 if (F->getName().startswith("llvm.arm")) 1148 return false; 1149 1150 switch (F->getIntrinsicID()) { 1151 default: break; 1152 case Intrinsic::powi: 1153 case Intrinsic::sin: 1154 case Intrinsic::cos: 1155 case Intrinsic::pow: 1156 case Intrinsic::log: 1157 case Intrinsic::log10: 1158 case Intrinsic::log2: 1159 case Intrinsic::exp: 1160 case Intrinsic::exp2: 1161 return true; 1162 case Intrinsic::sqrt: 1163 case Intrinsic::fabs: 1164 case Intrinsic::copysign: 1165 case Intrinsic::floor: 1166 case Intrinsic::ceil: 1167 case Intrinsic::trunc: 1168 case Intrinsic::rint: 1169 case Intrinsic::nearbyint: 1170 case Intrinsic::round: 1171 case Intrinsic::canonicalize: 1172 case Intrinsic::lround: 1173 case Intrinsic::llround: 1174 case Intrinsic::lrint: 1175 case Intrinsic::llrint: 1176 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64()) 1177 return true; 1178 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16()) 1179 return true; 1180 // Some operations can be handled by vector instructions and assume 1181 // unsupported vectors will be expanded into supported scalar ones. 1182 // TODO Handle scalar operations properly. 1183 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base(); 1184 case Intrinsic::masked_store: 1185 case Intrinsic::masked_load: 1186 case Intrinsic::masked_gather: 1187 case Intrinsic::masked_scatter: 1188 return !ST->hasMVEIntegerOps(); 1189 case Intrinsic::sadd_with_overflow: 1190 case Intrinsic::uadd_with_overflow: 1191 case Intrinsic::ssub_with_overflow: 1192 case Intrinsic::usub_with_overflow: 1193 case Intrinsic::sadd_sat: 1194 case Intrinsic::uadd_sat: 1195 case Intrinsic::ssub_sat: 1196 case Intrinsic::usub_sat: 1197 return false; 1198 } 1199 1200 return BaseT::isLoweredToCall(F); 1201 } 1202 1203 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, 1204 AssumptionCache &AC, 1205 TargetLibraryInfo *LibInfo, 1206 HardwareLoopInfo &HWLoopInfo) { 1207 // Low-overhead branches are only supported in the 'low-overhead branch' 1208 // extension of v8.1-m. 1209 if (!ST->hasLOB() || DisableLowOverheadLoops) { 1210 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n"); 1211 return false; 1212 } 1213 1214 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { 1215 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n"); 1216 return false; 1217 } 1218 1219 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); 1220 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { 1221 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n"); 1222 return false; 1223 } 1224 1225 const SCEV *TripCountSCEV = 1226 SE.getAddExpr(BackedgeTakenCount, 1227 SE.getOne(BackedgeTakenCount->getType())); 1228 1229 // We need to store the trip count in LR, a 32-bit register. 1230 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) { 1231 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n"); 1232 return false; 1233 } 1234 1235 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little 1236 // point in generating a hardware loop if that's going to happen. 1237 auto MaybeCall = [this](Instruction &I) { 1238 const ARMTargetLowering *TLI = getTLI(); 1239 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode()); 1240 EVT VT = TLI->getValueType(DL, I.getType(), true); 1241 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall) 1242 return true; 1243 1244 // Check if an intrinsic will be lowered to a call and assume that any 1245 // other CallInst will generate a bl. 1246 if (auto *Call = dyn_cast<CallInst>(&I)) { 1247 if (isa<IntrinsicInst>(Call)) { 1248 if (const Function *F = Call->getCalledFunction()) 1249 return isLoweredToCall(F); 1250 } 1251 return true; 1252 } 1253 1254 // FPv5 provides conversions between integer, double-precision, 1255 // single-precision, and half-precision formats. 1256 switch (I.getOpcode()) { 1257 default: 1258 break; 1259 case Instruction::FPToSI: 1260 case Instruction::FPToUI: 1261 case Instruction::SIToFP: 1262 case Instruction::UIToFP: 1263 case Instruction::FPTrunc: 1264 case Instruction::FPExt: 1265 return !ST->hasFPARMv8Base(); 1266 } 1267 1268 // FIXME: Unfortunately the approach of checking the Operation Action does 1269 // not catch all cases of Legalization that use library calls. Our 1270 // Legalization step categorizes some transformations into library calls as 1271 // Custom, Expand or even Legal when doing type legalization. So for now 1272 // we have to special case for instance the SDIV of 64bit integers and the 1273 // use of floating point emulation. 1274 if (VT.isInteger() && VT.getSizeInBits() >= 64) { 1275 switch (ISD) { 1276 default: 1277 break; 1278 case ISD::SDIV: 1279 case ISD::UDIV: 1280 case ISD::SREM: 1281 case ISD::UREM: 1282 case ISD::SDIVREM: 1283 case ISD::UDIVREM: 1284 return true; 1285 } 1286 } 1287 1288 // Assume all other non-float operations are supported. 1289 if (!VT.isFloatingPoint()) 1290 return false; 1291 1292 // We'll need a library call to handle most floats when using soft. 1293 if (TLI->useSoftFloat()) { 1294 switch (I.getOpcode()) { 1295 default: 1296 return true; 1297 case Instruction::Alloca: 1298 case Instruction::Load: 1299 case Instruction::Store: 1300 case Instruction::Select: 1301 case Instruction::PHI: 1302 return false; 1303 } 1304 } 1305 1306 // We'll need a libcall to perform double precision operations on a single 1307 // precision only FPU. 1308 if (I.getType()->isDoubleTy() && !ST->hasFP64()) 1309 return true; 1310 1311 // Likewise for half precision arithmetic. 1312 if (I.getType()->isHalfTy() && !ST->hasFullFP16()) 1313 return true; 1314 1315 return false; 1316 }; 1317 1318 auto IsHardwareLoopIntrinsic = [](Instruction &I) { 1319 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) { 1320 switch (Call->getIntrinsicID()) { 1321 default: 1322 break; 1323 case Intrinsic::set_loop_iterations: 1324 case Intrinsic::test_set_loop_iterations: 1325 case Intrinsic::loop_decrement: 1326 case Intrinsic::loop_decrement_reg: 1327 return true; 1328 } 1329 } 1330 return false; 1331 }; 1332 1333 // Scan the instructions to see if there's any that we know will turn into a 1334 // call or if this loop is already a low-overhead loop. 1335 auto ScanLoop = [&](Loop *L) { 1336 for (auto *BB : L->getBlocks()) { 1337 for (auto &I : *BB) { 1338 if (MaybeCall(I) || IsHardwareLoopIntrinsic(I)) { 1339 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n"); 1340 return false; 1341 } 1342 } 1343 } 1344 return true; 1345 }; 1346 1347 // Visit inner loops. 1348 for (auto Inner : *L) 1349 if (!ScanLoop(Inner)) 1350 return false; 1351 1352 if (!ScanLoop(L)) 1353 return false; 1354 1355 // TODO: Check whether the trip count calculation is expensive. If L is the 1356 // inner loop but we know it has a low trip count, calculating that trip 1357 // count (in the parent loop) may be detrimental. 1358 1359 LLVMContext &C = L->getHeader()->getContext(); 1360 HWLoopInfo.CounterInReg = true; 1361 HWLoopInfo.IsNestingLegal = false; 1362 HWLoopInfo.PerformEntryTest = true; 1363 HWLoopInfo.CountType = Type::getInt32Ty(C); 1364 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1); 1365 return true; 1366 } 1367 1368 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) { 1369 // We don't allow icmp's, and because we only look at single block loops, 1370 // we simply count the icmps, i.e. there should only be 1 for the backedge. 1371 if (isa<ICmpInst>(&I) && ++ICmpCount > 1) 1372 return false; 1373 1374 if (isa<FCmpInst>(&I)) 1375 return false; 1376 1377 // We could allow extending/narrowing FP loads/stores, but codegen is 1378 // too inefficient so reject this for now. 1379 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I)) 1380 return false; 1381 1382 // Extends have to be extending-loads 1383 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) ) 1384 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0))) 1385 return false; 1386 1387 // Truncs have to be narrowing-stores 1388 if (isa<TruncInst>(&I) ) 1389 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin())) 1390 return false; 1391 1392 return true; 1393 } 1394 1395 // To set up a tail-predicated loop, we need to know the total number of 1396 // elements processed by that loop. Thus, we need to determine the element 1397 // size and: 1398 // 1) it should be uniform for all operations in the vector loop, so we 1399 // e.g. don't want any widening/narrowing operations. 1400 // 2) it should be smaller than i64s because we don't have vector operations 1401 // that work on i64s. 1402 // 3) we don't want elements to be reversed or shuffled, to make sure the 1403 // tail-predication masks/predicates the right lanes. 1404 // 1405 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, 1406 const DataLayout &DL, 1407 const LoopAccessInfo *LAI) { 1408 PredicatedScalarEvolution PSE = LAI->getPSE(); 1409 int ICmpCount = 0; 1410 int Stride = 0; 1411 1412 LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n"); 1413 SmallVector<Instruction *, 16> LoadStores; 1414 for (BasicBlock *BB : L->blocks()) { 1415 for (Instruction &I : BB->instructionsWithoutDebug()) { 1416 if (isa<PHINode>(&I)) 1417 continue; 1418 if (!canTailPredicateInstruction(I, ICmpCount)) { 1419 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump()); 1420 return false; 1421 } 1422 1423 Type *T = I.getType(); 1424 if (T->isPointerTy()) 1425 T = T->getPointerElementType(); 1426 1427 if (T->getScalarSizeInBits() > 32) { 1428 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump()); 1429 return false; 1430 } 1431 1432 if (isa<StoreInst>(I) || isa<LoadInst>(I)) { 1433 Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1); 1434 int64_t NextStride = getPtrStride(PSE, Ptr, L); 1435 // TODO: for now only allow consecutive strides of 1. We could support 1436 // other strides as long as it is uniform, but let's keep it simple for 1437 // now. 1438 if (Stride == 0 && NextStride == 1) { 1439 Stride = NextStride; 1440 continue; 1441 } 1442 if (Stride != NextStride) { 1443 LLVM_DEBUG(dbgs() << "Different strides found, can't " 1444 "tail-predicate\n."); 1445 return false; 1446 } 1447 } 1448 } 1449 } 1450 1451 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n"); 1452 return true; 1453 } 1454 1455 bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, 1456 ScalarEvolution &SE, 1457 AssumptionCache &AC, 1458 TargetLibraryInfo *TLI, 1459 DominatorTree *DT, 1460 const LoopAccessInfo *LAI) { 1461 if (DisableTailPredication) 1462 return false; 1463 1464 // Creating a predicated vector loop is the first step for generating a 1465 // tail-predicated hardware loop, for which we need the MVE masked 1466 // load/stores instructions: 1467 if (!ST->hasMVEIntegerOps()) 1468 return false; 1469 1470 // For now, restrict this to single block loops. 1471 if (L->getNumBlocks() > 1) { 1472 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block " 1473 "loop.\n"); 1474 return false; 1475 } 1476 1477 assert(L->empty() && "preferPredicateOverEpilogue: inner-loop expected"); 1478 1479 HardwareLoopInfo HWLoopInfo(L); 1480 if (!HWLoopInfo.canAnalyze(*LI)) { 1481 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " 1482 "analyzable.\n"); 1483 return false; 1484 } 1485 1486 // This checks if we have the low-overhead branch architecture 1487 // extension, and if we will create a hardware-loop: 1488 if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) { 1489 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " 1490 "profitable.\n"); 1491 return false; 1492 } 1493 1494 if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) { 1495 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " 1496 "a candidate.\n"); 1497 return false; 1498 } 1499 1500 return canTailPredicateLoop(L, LI, SE, DL, LAI); 1501 } 1502 1503 bool ARMTTIImpl::emitGetActiveLaneMask() const { 1504 if (!ST->hasMVEIntegerOps() || DisableTailPredication) 1505 return false; 1506 1507 // Intrinsic @llvm.get.active.lane.mask is supported. 1508 // It is used in the MVETailPredication pass, which requires the number of 1509 // elements processed by this vector loop to setup the tail-predicated 1510 // loop. 1511 return true; 1512 } 1513 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 1514 TTI::UnrollingPreferences &UP) { 1515 // Only currently enable these preferences for M-Class cores. 1516 if (!ST->isMClass()) 1517 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP); 1518 1519 // Disable loop unrolling for Oz and Os. 1520 UP.OptSizeThreshold = 0; 1521 UP.PartialOptSizeThreshold = 0; 1522 if (L->getHeader()->getParent()->hasOptSize()) 1523 return; 1524 1525 // Only enable on Thumb-2 targets. 1526 if (!ST->isThumb2()) 1527 return; 1528 1529 SmallVector<BasicBlock*, 4> ExitingBlocks; 1530 L->getExitingBlocks(ExitingBlocks); 1531 LLVM_DEBUG(dbgs() << "Loop has:\n" 1532 << "Blocks: " << L->getNumBlocks() << "\n" 1533 << "Exit blocks: " << ExitingBlocks.size() << "\n"); 1534 1535 // Only allow another exit other than the latch. This acts as an early exit 1536 // as it mirrors the profitability calculation of the runtime unroller. 1537 if (ExitingBlocks.size() > 2) 1538 return; 1539 1540 // Limit the CFG of the loop body for targets with a branch predictor. 1541 // Allowing 4 blocks permits if-then-else diamonds in the body. 1542 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4) 1543 return; 1544 1545 // Scan the loop: don't unroll loops with calls as this could prevent 1546 // inlining. 1547 unsigned Cost = 0; 1548 for (auto *BB : L->getBlocks()) { 1549 for (auto &I : *BB) { 1550 // Don't unroll vectorised loop. MVE does not benefit from it as much as 1551 // scalar code. 1552 if (I.getType()->isVectorTy()) 1553 return; 1554 1555 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 1556 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 1557 if (!isLoweredToCall(F)) 1558 continue; 1559 } 1560 return; 1561 } 1562 1563 SmallVector<const Value*, 4> Operands(I.value_op_begin(), 1564 I.value_op_end()); 1565 Cost += getUserCost(&I, Operands, TargetTransformInfo::TCK_CodeSize); 1566 } 1567 } 1568 1569 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n"); 1570 1571 UP.Partial = true; 1572 UP.Runtime = true; 1573 UP.UpperBound = true; 1574 UP.UnrollRemainder = true; 1575 UP.DefaultUnrollRuntimeCount = 4; 1576 UP.UnrollAndJam = true; 1577 UP.UnrollAndJamInnerLoopThreshold = 60; 1578 1579 // Force unrolling small loops can be very useful because of the branch 1580 // taken cost of the backedge. 1581 if (Cost < 12) 1582 UP.Force = true; 1583 } 1584 1585 bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, 1586 TTI::ReductionFlags Flags) const { 1587 return ST->hasMVEIntegerOps(); 1588 } 1589