//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
    "enable-arm-maskedldst", cl::Hidden, cl::init(true),
    cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
    "disable-arm-loloops", cl::Hidden, cl::init(false),
    cl::desc("Disable the generation of low-overhead loops"));

// Defined in another TU; declared here so the flags can be consulted by this
// cost model.
extern cl::opt<bool> DisableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

/// Decide whether \p Callee may be inlined into \p Caller based on their
/// subtarget feature bits. Features outside the whitelist must match exactly;
/// whitelisted features only require the callee's set to be a subset of the
/// caller's.
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the whitelist must match exactly.
  bool MatchExact = (CallerBits & ~InlineFeatureWhitelist) ==
                    (CalleeBits & ~InlineFeatureWhitelist);
  // For features in the whitelist, the callee's features must be a subset of
  // the callers'.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeatureWhitelist) ==
                     (CalleeBits & InlineFeatureWhitelist);
  return MatchExact && MatchSubset;
}

/// Favor backedge-based indexing only for small (single-block) Thumb2 M-class
/// loops that are not optimized for size; MVE targets use postinc instead.
bool ARMTTIImpl::shouldFavorBackedgeIndex(const Loop *L) const {
  if (L->getHeader()->getParent()->hasOptSize())
    return false;
  if (ST->hasMVEIntegerOps())
    return false;
  return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1;
}

/// Post-increment addressing is preferred when MVE integer ops are available.
bool ARMTTIImpl::shouldFavorPostInc() const {
  if (ST->hasMVEIntegerOps())
    return true;
  return false;
}

/// Return the cost (in "number of instructions" units) of materializing the
/// integer immediate \p Imm, per ISA mode (ARM / Thumb2 / Thumb1).
int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  // Immediates wider than 64 bits (or of zero-sized type) get a conservative
  // flat cost.
  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    // ARM mode: MOVW range, or a shifter-operand immediate (possibly of the
    // complement, which MVN can use), costs a single instruction.
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    // Thumb2: same idea with the T2 modified-immediate encoding.
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // Thumb1, any i8 imm cost 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  // Complement in i8 range (MOV + MVN) or a shifted Thumb immediate.
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constantpool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of
// Thumb1 instructions so we return a zero cost and 1 otherwise.
int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                      const APInt &Imm, Type *Ty) {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}

/// Immediate cost in the context of a particular instruction \p Opcode, where
/// some immediates are free (divisor constants, AND masks with UXTB/UXTH,
/// negatable compares, all-ones XOR).
int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
                                  Type *Ty) {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty), getIntImmCost(-Imm, Ty));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1<<12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1<<8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN
  if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
    return 0;

  return getIntImmCost(Imm, Ty);
}

/// Cost of a cast instruction, driven by per-subtarget (NEON / MVE / scalar)
/// conversion cost tables keyed on (ISD opcode, dst type, src type). Falls
/// back to the base implementation for anything not in a table.
int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                 const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Single to/from double precision conversions.
  static const CostTblEntry NEONFltDblTbl[] = {
    // Vector fptrunc/fpext conversions.
    { ISD::FP_ROUND,   MVT::v2f64, 2 },
    { ISD::FP_EXTEND,  MVT::v2f32, 2 },
    { ISD::FP_EXTEND,  MVT::v4f32, 4 }
  };

  if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
                                             ISD == ISD::FP_EXTEND)) {
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return BaseT::getCastInstrCost(Opcode, Dst, Src);

  // The extend of a load is free
  if (I && isa<LoadInst>(I->getOperand(0))) {
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
        // Extends to i64 need one extra instruction for the high half.
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return Entry->Cost;

    // MVE has extending vector loads, so these widening loads are free too.
    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost;
    }
  }

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Operations that we legalize using splitting.
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },

    // Vector float <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },

    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },

    // Vector double <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
    // i64 conversions are expanded into a long libcall-like sequence.
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    // i64 sources require an expensive expansion.
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;
  }

  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext
  // are linearised so take more.
  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
  };

  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      // MVE vector costs are scaled by the beats-per-vector factor.
      return Entry->Cost * ST->getMVEVectorCostFactor();
  }

  // Scalar integer conversion costs.
  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
    // i16 -> i64 requires two dependent operations.
    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },

    // Truncates on i64 are assumed to be free.
    { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
  };

  if (SrcTy.isInteger()) {
    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;
  }

  // Anything else: base cost, scaled up for MVE vectors.
  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;
  return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src);
}

/// Cost of inserting or extracting a vector element at \p Index.
int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                   unsigned Index) {
  // Penalize inserting into an D-subregister.
  // We end up with a three times
  // lower estimated throughput on swift.
  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
    return 3;

  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
                        Opcode == Instruction::ExtractElement)) {
    // Cross-class copies are expensive on many microarchitectures,
    // so assume they are expensive by default.
    if (ValTy->getVectorElementType()->isIntegerTy())
      return 3;

    // Even if it's not a cross class copy, this likely leads to mixing
    // of NEON and VFP code and should be therefore penalized.
    if (ValTy->isVectorTy() &&
        ValTy->getScalarSizeInBits() <= 32)
      return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
  }

  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
                                 Opcode == Instruction::ExtractElement)) {
    // We say MVE moves costs at least the MVEVectorCostFactor, even though
    // they are scalar instructions. This helps prevent mixing scalar and
    // vector, to prevent vectorising where we end up just scalarising the
    // result anyway.
    return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
                    ST->getMVEVectorCostFactor()) *
           ValTy->getVectorNumElements() / 2;
  }

  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}

/// Cost of a compare or select. NEON vector selects lower to VBSL but a few
/// wide-condition cases legalize very poorly and are given explicit costs.
int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                   const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // On NEON a vector select gets lowered to vbsl.
  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // Lowering of some vector selects is currently far from perfect.
    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }

    // Otherwise a legalized select is one VBSL per legalized vector.
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
    return LT.first;
  }

  int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;
  return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}

/// Cost of computing an address for a memory access. Non-consecutive vector
/// addresses are penalized on NEON since they cannot fold into the
/// addressing mode.
int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                          const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;
  int MaxMergeDistance = 64;

  if (ST->hasNEON()) {
    if (Ty->isVectorTy() && SE &&
        !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
      return NumVectorInstToHideOverhead;

    // In many cases the address computation is not merged into the instruction
    // addressing mode.
    return 1;
  }
  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}

/// Whether an MVE masked load is legal for \p DataTy at \p Alignment.
bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
    return false;

  if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
    // Don't support v2i1 yet.
    if (VecTy->getNumElements() == 2)
      return false;

    // We don't support extending fp types.
    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
    if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
      return false;
  }

  // The element width must match a natural MVE access, with sufficient
  // alignment (unknown alignment is accepted).
  unsigned EltWidth = DataTy->getScalarSizeInBits();
  return (EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
         (EltWidth == 16 && (!Alignment || Alignment >= 2)) ||
         (EltWidth == 8);
}

/// Whether a masked gather is legal. Note the deliberate asymmetry: scalar
/// types (from the vectorizer) may be accepted, vector types (from the
/// scalarize-masked-intrinsics pass) are always rejected so they get expanded.
bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) {
  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
    return false;

  // This method is called in 2 places:
  //  - from the vectorizer with a scalar type, in which case we need to get
  //  this as good as we can with the limited info we have (and rely on the cost
  //  model for the rest).
  //  - from the masked intrinsic lowering pass with the actual vector type.
  // For MVE, we have a custom lowering pass that will already have custom
  // legalised any gathers that we can to MVE intrinsics, and want to expand all
  // the rest. The pass runs before the masked intrinsic lowering pass, so if we
  // are here, we know we want to expand.
  if (isa<VectorType>(Ty))
    return false;

  unsigned EltWidth = Ty->getScalarSizeInBits();
  return ((EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
          (EltWidth == 16 && (!Alignment || Alignment >= 2)) || EltWidth == 8);
}

/// Cost of a memcpy instruction: either a library call, or the number of
/// load/store operations the backend would inline it into.
int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
  const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
  assert(MI && "MemcpyInst expected");
  ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength());

  // To model the cost of a library call, we assume 1 for the call, and
  // 3 for the argument setup.
  const unsigned LibCallCost = 4;

  // If 'size' is not a constant, a library call will be generated.
  if (!C)
    return LibCallCost;

  const unsigned Size = C->getValue().getZExtValue();
  const Align DstAlign = *MI->getDestAlign();
  const Align SrcAlign = *MI->getSourceAlign();
  const Function *F = I->getParent()->getParent();
  const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
  std::vector<EVT> MemOps;

  // MemOps will be poplulated with a list of data types that needs to be
  // loaded and stored. That's why we multiply the number of elements by 2 to
  // get the cost for this memcpy.
  if (getTLI()->findOptimalMemOpLowering(
          MemOps, Limit,
          MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
                      /*IsVolatile*/ true),
          MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
          F->getAttributes()))
    return MemOps.size() * 2;

  // If we can't find an optimal memop lowering, return the default cost
  return LibCallCost;
}

/// Cost of a vector shuffle, with dedicated tables for broadcast, reverse
/// and select kinds on NEON, and broadcast on MVE.
int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  if (ST->hasNEON()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry NEONDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};

      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

      if (const auto *Entry =
              CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Reverse) {
      static const CostTblEntry NEONShuffleTbl[] = {
          // Reverse shuffle cost one instruction if we are shuffling within a
          // double word (vrev) or two if we shuffle a quad word (vrev, vext).
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};

      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

      if (const auto *Entry =
              CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Select) {
      static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions
          // required to create the shuffled vector.

          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},

          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},

          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};

      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
                                              ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
  }
  if (ST->hasMVEIntegerOps()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry MVEDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};

      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

      if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                              LT.second))
        // MVE vector costs are scaled by the beats-per-vector factor.
        return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
    }
  }
  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;
  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

/// Cost of an arithmetic instruction. NEON vector divisions are modelled as
/// (very expensive) library calls; free-shift folding and MVE scaling are
/// applied for the remaining cases.
int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                       TTI::OperandValueKind Op1Info,
                                       TTI::OperandValueKind Op2Info,
                                       TTI::OperandValueProperties Opd1PropInfo,
                                       TTI::OperandValueProperties Opd2PropInfo,
                                       ArrayRef<const Value *> Args,
                                       const Instruction *CxtI) {
  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  if (ST->hasNEON()) {
    const unsigned FunctionCallDivCost = 20;
    const unsigned ReciprocalDivCost = 10;
    static const CostTblEntry CostTbl[] = {
      // Division.
      // These costs are somewhat random. Choose a cost of 20 to indicate that
      // vectorizing devision (added function call) is going to be very expensive.
      // Double registers types.
      { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
      { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
      { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
      { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
      { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
      { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
      { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
      // Quad register types.
      { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
      // Multiplication.
    };

    if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
      return LT.first * Entry->Cost;

    int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                             Opd1PropInfo, Opd2PropInfo);

    // This is somewhat of a hack. The problem that we are facing is that SROA
    // creates a sequence of shift, and, or instructions to construct values.
    // These sequences are recognized by the ISel and have zero-cost. Not so for
    // the vectorized code. Because we have support for v2i64 but not i64 those
    // sequences look particularly beneficial to vectorize.
    // To work around this we increase the cost of v2i64 operations to make them
    // seem less beneficial.
    if (LT.second == MVT::v2i64 &&
        Op2Info == TargetTransformInfo::OK_UniformConstantValue)
      Cost += 4;

    return Cost;
  }

  // If this operation is a shift on arm/thumb2, it might well be folded into
  // the following instruction, hence having a cost of 0.
  auto LooksLikeAFreeShift = [&]() {
    // Thumb1 has no shifted-operand forms, and vector shifts never fold.
    if (ST->isThumb1Only() || Ty->isVectorTy())
      return false;

    if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
      return false;
    if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
      return false;

    // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
    switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::And:
    case Instruction::Xor:
    case Instruction::Or:
    case Instruction::ICmp:
      return true;
    default:
      return false;
    }
  };
  if (LooksLikeAFreeShift())
    return 0;

  int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;

  // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
  // without treating floats as more expensive that scalars or increasing the
  // costs for custom operations. The results is also multiplied by the
  // MVEVectorCostFactor where appropriate.
  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
    return LT.first * BaseCost;

  // Else this is expand, assume that we need to scalarize this op.
  if (Ty->isVectorTy()) {
    unsigned Num = Ty->getVectorNumElements();
    unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
    // Return the cost of multiple scalar invocation plus the cost of
    // inserting and extracting the values.
    return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost;
  }

  return BaseCost;
}

/// Cost of a plain load/store, penalizing unaligned NEON f64 vector accesses
/// and scaling MVE vector accesses.
int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                MaybeAlign Alignment, unsigned AddressSpace,
                                const Instruction *I) {
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);

  if (ST->hasNEON() && Src->isVectorTy() &&
      (Alignment && *Alignment != Align(16)) &&
      Src->getVectorElementType()->isDoubleTy()) {
    // Unaligned loads/stores are extremely inefficient.
    // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
    return LT.first * 4;
  }
  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;
  return BaseCost * LT.first;
}

/// Cost of an interleaved (vldN/vstN style) memory operation of the given
/// \p Factor over \p VecTy.
int ARMTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
    bool UseMaskForGaps) {
  assert(Factor >= 2 && "Invalid interleave factor");
  assert(isa<VectorType>(VecTy) && "Expect a vector type");

  // vldN/vstN doesn't support vector types of i64/f64 element.
  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;

  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
      !UseMaskForCond && !UseMaskForGaps) {
    unsigned NumElts = VecTy->getVectorNumElements();
    auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);

    // vldN/vstN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
    // matched to more than one vldN/vstN instruction.
    int BaseCost = ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor() : 1;
    if (NumElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(Factor, SubVecTy, DL))
      return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);

    // Some smaller than legal interleaved patterns are cheap as we can make
    // use of the vmovn or vrev patterns to interleave a standard load. This is
    // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
    // promoted differently). The cost of 2 here is then a load and vrev or
    // vmovn.
    if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
        VecTy->isIntOrIntVectorTy() && DL.getTypeSizeInBits(SubVecTy) <= 64)
      return 2 * BaseCost;
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace,
                                           UseMaskForCond, UseMaskForGaps);
}

/// Cost of an MVE gather/scatter; returns either a serialized "vector" cost
/// or a larger scalarization cost depending on type, alignment and the shape
/// of the pointer operand.
unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                            Value *Ptr, bool VariableMask,
                                            unsigned Alignment) {
  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment);

  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
  VectorType *VTy = cast<VectorType>(DataTy);

  // TODO: Splitting, once we do that.
  // TODO: trunc/sext/zext the result/input

  unsigned NumElems = VTy->getNumElements();
  unsigned EltSize = VTy->getScalarSizeInBits();
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);

  // For now, it is assumed that for the MVE gather instructions the loads are
  // all effectively serialised. This means the cost is the scalar cost
  // multiplied by the number of elements being loaded. This is possibly very
  // conservative, but even so we still end up vectorising loops because the
  // cost per iteration for many loops is lower than for scalar loops.
886 unsigned VectorCost = NumElems * LT.first; 887 // The scalarization cost should be a lot higher. We use the number of vector 888 // elements plus the scalarization overhead. 889 unsigned ScalarCost = 890 NumElems * LT.first + BaseT::getScalarizationOverhead(DataTy, {}); 891 892 // TODO: Cost extended gathers or trunc stores correctly. 893 if (EltSize * NumElems != 128 || NumElems < 4) 894 return ScalarCost; 895 if (Alignment < EltSize / 8) 896 return ScalarCost; 897 898 // Any (aligned) i32 gather will not need to be scalarised. 899 if (EltSize == 32) 900 return VectorCost; 901 // For smaller types, we need to ensure that the gep's inputs are correctly 902 // extended from a small enough value. Other size (including i64) are 903 // scalarized for now. 904 if (EltSize != 8 && EltSize != 16) 905 return ScalarCost; 906 907 if (auto BC = dyn_cast<BitCastInst>(Ptr)) 908 Ptr = BC->getOperand(0); 909 if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) { 910 if (GEP->getNumOperands() != 2) 911 return ScalarCost; 912 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType()); 913 // Scale needs to be correct (which is only relevant for i16s). 914 if (Scale != 1 && Scale * 8 != EltSize) 915 return ScalarCost; 916 // And we need to zext (not sext) the indexes from a small enough type. 917 if (auto ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) 918 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= EltSize) 919 return VectorCost; 920 return ScalarCost; 921 } 922 return ScalarCost; 923 } 924 925 bool ARMTTIImpl::isLoweredToCall(const Function *F) { 926 if (!F->isIntrinsic()) 927 BaseT::isLoweredToCall(F); 928 929 // Assume all Arm-specific intrinsics map to an instruction. 
930 if (F->getName().startswith("llvm.arm")) 931 return false; 932 933 switch (F->getIntrinsicID()) { 934 default: break; 935 case Intrinsic::powi: 936 case Intrinsic::sin: 937 case Intrinsic::cos: 938 case Intrinsic::pow: 939 case Intrinsic::log: 940 case Intrinsic::log10: 941 case Intrinsic::log2: 942 case Intrinsic::exp: 943 case Intrinsic::exp2: 944 return true; 945 case Intrinsic::sqrt: 946 case Intrinsic::fabs: 947 case Intrinsic::copysign: 948 case Intrinsic::floor: 949 case Intrinsic::ceil: 950 case Intrinsic::trunc: 951 case Intrinsic::rint: 952 case Intrinsic::nearbyint: 953 case Intrinsic::round: 954 case Intrinsic::canonicalize: 955 case Intrinsic::lround: 956 case Intrinsic::llround: 957 case Intrinsic::lrint: 958 case Intrinsic::llrint: 959 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64()) 960 return true; 961 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16()) 962 return true; 963 // Some operations can be handled by vector instructions and assume 964 // unsupported vectors will be expanded into supported scalar ones. 965 // TODO Handle scalar operations properly. 966 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base(); 967 case Intrinsic::masked_store: 968 case Intrinsic::masked_load: 969 case Intrinsic::masked_gather: 970 case Intrinsic::masked_scatter: 971 return !ST->hasMVEIntegerOps(); 972 case Intrinsic::sadd_with_overflow: 973 case Intrinsic::uadd_with_overflow: 974 case Intrinsic::ssub_with_overflow: 975 case Intrinsic::usub_with_overflow: 976 case Intrinsic::sadd_sat: 977 case Intrinsic::uadd_sat: 978 case Intrinsic::ssub_sat: 979 case Intrinsic::usub_sat: 980 return false; 981 } 982 983 return BaseT::isLoweredToCall(F); 984 } 985 986 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, 987 AssumptionCache &AC, 988 TargetLibraryInfo *LibInfo, 989 HardwareLoopInfo &HWLoopInfo) { 990 // Low-overhead branches are only supported in the 'low-overhead branch' 991 // extension of v8.1-m. 
  if (!ST->hasLOB() || DisableLowOverheadLoops)
    return false;

  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
    return false;

  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
    return false;

  // Trip count = backedge-taken count + 1.
  const SCEV *TripCountSCEV =
    SE.getAddExpr(BackedgeTakenCount,
                  SE.getOne(BackedgeTakenCount->getType()));

  // We need to store the trip count in LR, a 32-bit register.
  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
    return false;

  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
  // point in generating a hardware loop if that's going to happen.
  // Returns true if instruction \p I may be lowered to a call or libcall.
  auto MaybeCall = [this](Instruction &I) {
    const ARMTargetLowering *TLI = getTLI();
    unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
    EVT VT = TLI->getValueType(DL, I.getType(), true);
    if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
      return true;

    // Check if an intrinsic will be lowered to a call and assume that any
    // other CallInst will generate a bl.
    if (auto *Call = dyn_cast<CallInst>(&I)) {
      if (isa<IntrinsicInst>(Call)) {
        if (const Function *F = Call->getCalledFunction())
          return isLoweredToCall(F);
      }
      return true;
    }

    // FPv5 provides conversions between integer, double-precision,
    // single-precision, and half-precision formats.
    switch (I.getOpcode()) {
    default:
      break;
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::FPTrunc:
    case Instruction::FPExt:
      return !ST->hasFPARMv8Base();
    }

    // FIXME: Unfortunately the approach of checking the Operation Action does
    // not catch all cases of Legalization that use library calls. Our
    // Legalization step categorizes some transformations into library calls as
    // Custom, Expand or even Legal when doing type legalization. So for now
    // we have to special case for instance the SDIV of 64bit integers and the
    // use of floating point emulation.
    if (VT.isInteger() && VT.getSizeInBits() >= 64) {
      switch (ISD) {
      default:
        break;
      case ISD::SDIV:
      case ISD::UDIV:
      case ISD::SREM:
      case ISD::UREM:
      case ISD::SDIVREM:
      case ISD::UDIVREM:
        return true;
      }
    }

    // Assume all other non-float operations are supported.
    if (!VT.isFloatingPoint())
      return false;

    // We'll need a library call to handle most floats when using soft.
    if (TLI->useSoftFloat()) {
      switch (I.getOpcode()) {
      default:
        return true;
      case Instruction::Alloca:
      case Instruction::Load:
      case Instruction::Store:
      case Instruction::Select:
      case Instruction::PHI:
        return false;
      }
    }

    // We'll need a libcall to perform double precision operations on a single
    // precision only FPU.
    if (I.getType()->isDoubleTy() && !ST->hasFP64())
      return true;

    // Likewise for half precision arithmetic.
    if (I.getType()->isHalfTy() && !ST->hasFullFP16())
      return true;

    return false;
  };

  // Returns true if \p I is one of the hardware-loop intrinsics, i.e. the
  // loop has already been converted to a low-overhead loop.
  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
    if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
      switch (Call->getIntrinsicID()) {
      default:
        break;
      case Intrinsic::set_loop_iterations:
      case Intrinsic::test_set_loop_iterations:
      case Intrinsic::loop_decrement:
      case Intrinsic::loop_decrement_reg:
        return true;
      }
    }
    return false;
  };

  // Scan the instructions to see if there's any that we know will turn into a
  // call or if this loop is already a low-overhead loop.
  auto ScanLoop = [&](Loop *L) {
    for (auto *BB : L->getBlocks()) {
      for (auto &I : *BB) {
        if (MaybeCall(I) || IsHardwareLoopIntrinsic(I))
          return false;
      }
    }
    return true;
  };

  // Visit inner loops.
  for (auto Inner : *L)
    if (!ScanLoop(Inner))
      return false;

  if (!ScanLoop(L))
    return false;

  // TODO: Check whether the trip count calculation is expensive. If L is the
  // inner loop but we know it has a low trip count, calculating that trip
  // count (in the parent loop) may be detrimental.

  LLVMContext &C = L->getHeader()->getContext();
  HWLoopInfo.CounterInReg = true;
  HWLoopInfo.IsNestingLegal = false;
  HWLoopInfo.PerformEntryTest = true;
  HWLoopInfo.CountType = Type::getInt32Ty(C);
  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  return true;
}

/// Return true if instruction \p I is allowed inside a tail-predicated loop.
/// \p ICmpCount counts the icmps seen so far across calls; only one (the
/// backedge compare) is permitted per loop.
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
  // We don't allow icmp's, and because we only look at single block loops,
  // we simply count the icmps, i.e. there should only be 1 for the backedge.
  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
    return false;

  if (isa<FCmpInst>(&I))
    return false;

  // We could allow extending/narrowing FP loads/stores, but codegen is
  // too inefficient so reject this for now.
  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
    return false;

  // Extends have to be extending-loads
  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
    if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
      return false;

  // Truncs have to be narrowing-stores
  if (isa<TruncInst>(&I) )
    if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
      return false;

  return true;
}

// To set up a tail-predicated loop, we need to know the total number of
// elements processed by that loop.
Thus, we need to determine the element 1170 // size and: 1171 // 1) it should be uniform for all operations in the vector loop, so we 1172 // e.g. don't want any widening/narrowing operations. 1173 // 2) it should be smaller than i64s because we don't have vector operations 1174 // that work on i64s. 1175 // 3) we don't want elements to be reversed or shuffled, to make sure the 1176 // tail-predication masks/predicates the right lanes. 1177 // 1178 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, 1179 const DataLayout &DL, 1180 const LoopAccessInfo *LAI) { 1181 PredicatedScalarEvolution PSE = LAI->getPSE(); 1182 int ICmpCount = 0; 1183 int Stride = 0; 1184 1185 LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n"); 1186 SmallVector<Instruction *, 16> LoadStores; 1187 for (BasicBlock *BB : L->blocks()) { 1188 for (Instruction &I : BB->instructionsWithoutDebug()) { 1189 if (isa<PHINode>(&I)) 1190 continue; 1191 if (!canTailPredicateInstruction(I, ICmpCount)) { 1192 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump()); 1193 return false; 1194 } 1195 1196 Type *T = I.getType(); 1197 if (T->isPointerTy()) 1198 T = T->getPointerElementType(); 1199 1200 if (T->getScalarSizeInBits() > 32) { 1201 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump()); 1202 return false; 1203 } 1204 1205 if (isa<StoreInst>(I) || isa<LoadInst>(I)) { 1206 Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1); 1207 int64_t NextStride = getPtrStride(PSE, Ptr, L); 1208 // TODO: for now only allow consecutive strides of 1. We could support 1209 // other strides as long as it is uniform, but let's keep it simple for 1210 // now. 
1211 if (Stride == 0 && NextStride == 1) { 1212 Stride = NextStride; 1213 continue; 1214 } 1215 if (Stride != NextStride) { 1216 LLVM_DEBUG(dbgs() << "Different strides found, can't " 1217 "tail-predicate\n."); 1218 return false; 1219 } 1220 } 1221 } 1222 } 1223 1224 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n"); 1225 return true; 1226 } 1227 1228 bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, 1229 ScalarEvolution &SE, 1230 AssumptionCache &AC, 1231 TargetLibraryInfo *TLI, 1232 DominatorTree *DT, 1233 const LoopAccessInfo *LAI) { 1234 if (DisableTailPredication) 1235 return false; 1236 1237 // Creating a predicated vector loop is the first step for generating a 1238 // tail-predicated hardware loop, for which we need the MVE masked 1239 // load/stores instructions: 1240 if (!ST->hasMVEIntegerOps()) 1241 return false; 1242 1243 // For now, restrict this to single block loops. 1244 if (L->getNumBlocks() > 1) { 1245 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block " 1246 "loop.\n"); 1247 return false; 1248 } 1249 1250 assert(L->empty() && "preferPredicateOverEpilogue: inner-loop expected"); 1251 1252 HardwareLoopInfo HWLoopInfo(L); 1253 if (!HWLoopInfo.canAnalyze(*LI)) { 1254 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " 1255 "analyzable.\n"); 1256 return false; 1257 } 1258 1259 // This checks if we have the low-overhead branch architecture 1260 // extension, and if we will create a hardware-loop: 1261 if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) { 1262 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " 1263 "profitable.\n"); 1264 return false; 1265 } 1266 1267 if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) { 1268 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " 1269 "a candidate.\n"); 1270 return false; 1271 } 1272 1273 return canTailPredicateLoop(L, LI, SE, DL, LAI); 1274 } 1275 1276 1277 void 
ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                    TTI::UnrollingPreferences &UP) {
  // Only currently enable these preferences for M-Class cores.
  if (!ST->isMClass())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  // Only enable on Thumb-2 targets.
  if (!ST->isThumb2())
    return;

  SmallVector<BasicBlock*, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow another exit other than the latch. This acts as an early exit
  // as it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  unsigned Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Don't unroll vectorised loop. MVE does not benefit from it as much as
      // scalar code.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        ImmutableCallSite CS(&I);
        // Calls that are not lowered to an actual call instruction (e.g.
        // some intrinsics) don't block unrolling.
        if (const Function *F = CS.getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      // Accumulate an approximate size cost for the loop body.
      SmallVector<const Value*, 4> Operands(I.value_op_begin(),
                                            I.value_op_end());
      Cost += getUserCost(&I, Operands);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UpperBound = true;
  UP.UnrollRemainder = true;
  UP.DefaultUnrollRuntimeCount = 4;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Force unrolling of small loops, which can be very useful because of the
  // branch taken cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

/// Return true if a horizontal reduction with \p Opcode over vector type
/// \p Ty should use a reduction intrinsic on this target: only MVE integer
/// add/icmp reductions over sub-64-bit elements filling a multiple of 128
/// bits qualify.
bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
                                       TTI::ReductionFlags Flags) const {
  assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
  unsigned ScalarBits = Ty->getScalarSizeInBits();
  if (!ST->hasMVEIntegerOps())
    return false;

  switch (Opcode) {
  case Instruction::FAdd:
  case Instruction::FMul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Mul:
  case Instruction::FCmp:
    return false;
  case Instruction::ICmp:
  case Instruction::Add:
    return ScalarBits < 64 &&
           (ScalarBits * Ty->getVectorNumElements()) % 128 == 0;
  default:
    llvm_unreachable("Unhandled reduction opcode");
  }
  // Unreachable: all cases above return (kept for compilers that warn on
  // missing return).
  return false;
}