1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// \file 10 /// This file implements a TargetTransformInfo analysis pass specific to the 11 /// X86 target machine. It uses the target's detailed information to provide 12 /// more precise answers to certain TTI queries, while letting the target 13 /// independent and default TTI implementations handle the rest. 14 /// 15 //===----------------------------------------------------------------------===// 16 17 #define DEBUG_TYPE "x86tti" 18 #include "X86.h" 19 #include "X86TargetMachine.h" 20 #include "llvm/Analysis/TargetTransformInfo.h" 21 #include "llvm/Support/Debug.h" 22 #include "llvm/Target/TargetLowering.h" 23 using namespace llvm; 24 25 // Declare the pass initialization routine locally as target-specific passes 26 // don't havve a target-wide initialization entry point, and so we rely on the 27 // pass constructor initialization. 28 namespace llvm { 29 void initializeX86TTIPass(PassRegistry &); 30 } 31 32 namespace { 33 34 class X86TTI : public ImmutablePass, public TargetTransformInfo { 35 const X86TargetMachine *TM; 36 const X86Subtarget *ST; 37 const X86TargetLowering *TLI; 38 39 /// Estimate the overhead of scalarizing an instruction. Insert and Extract 40 /// are set if the result needs to be inserted and/or extracted from vectors. 41 unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; 42 43 public: 44 X86TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { 45 llvm_unreachable("This pass cannot be directly constructed"); 46 } 47 48 X86TTI(const X86TargetMachine *TM) 49 : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), 50 TLI(TM->getTargetLowering()) { 51 initializeX86TTIPass(*PassRegistry::getPassRegistry()); 52 } 53 54 virtual void initializePass() { 55 pushTTIStack(this); 56 } 57 58 virtual void finalizePass() { 59 popTTIStack(); 60 } 61 62 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 63 TargetTransformInfo::getAnalysisUsage(AU); 64 } 65 66 /// Pass identification. 67 static char ID; 68 69 /// Provide necessary pointer adjustments for the two base classes. 70 virtual void *getAdjustedAnalysisPointer(const void *ID) { 71 if (ID == &TargetTransformInfo::ID) 72 return (TargetTransformInfo*)this; 73 return this; 74 } 75 76 /// \name Scalar TTI Implementations 77 /// @{ 78 virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const; 79 80 /// @} 81 82 /// \name Vector TTI Implementations 83 /// @{ 84 85 virtual unsigned getNumberOfRegisters(bool Vector) const; 86 virtual unsigned getRegisterBitWidth(bool Vector) const; 87 virtual unsigned getMaximumUnrollFactor() const; 88 virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const; 89 virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, 90 int Index, Type *SubTp) const; 91 virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst, 92 Type *Src) const; 93 virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 94 Type *CondTy) const; 95 virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val, 96 unsigned Index) const; 97 virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src, 98 unsigned Alignment, 99 unsigned AddressSpace) const; 100 101 /// @} 102 }; 103 104 } // end anonymous namespace 105 106 INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti", 107 "X86 Target Transform Info", true, true, false) 108 char X86TTI::ID = 0; 109 110 ImmutablePass * 111 llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) { 112 return new X86TTI(TM); 113 } 114 115 116 //===----------------------------------------------------------------------===// 117 // 118 // X86 cost model. 119 // 120 //===----------------------------------------------------------------------===// 121 122 namespace { 123 struct X86CostTblEntry { 124 int ISD; 125 MVT Type; 126 unsigned Cost; 127 }; 128 } 129 130 static int 131 FindInTable(const X86CostTblEntry *Tbl, unsigned len, int ISD, MVT Ty) { 132 for (unsigned int i = 0; i < len; ++i) 133 if (Tbl[i].ISD == ISD && Tbl[i].Type == Ty) 134 return i; 135 136 // Could not find an entry. 137 return -1; 138 } 139 140 namespace { 141 struct X86TypeConversionCostTblEntry { 142 int ISD; 143 MVT Dst; 144 MVT Src; 145 unsigned Cost; 146 }; 147 } 148 149 static int 150 FindInConvertTable(const X86TypeConversionCostTblEntry *Tbl, unsigned len, 151 int ISD, MVT Dst, MVT Src) { 152 for (unsigned int i = 0; i < len; ++i) 153 if (Tbl[i].ISD == ISD && Tbl[i].Src == Src && Tbl[i].Dst == Dst) 154 return i; 155 156 // Could not find an entry. 157 return -1; 158 } 159 160 X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const { 161 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 162 // TODO: Currently the __builtin_popcount() implementation using SSE3 163 // instructions is inefficient. Once the problem is fixed, we should 164 // call ST->hasSSE3() instead of ST->hasSSE4(). 165 return ST->hasSSE41() ? PSK_FastHardware : PSK_Software; 166 } 167 168 unsigned X86TTI::getNumberOfRegisters(bool Vector) const { 169 if (Vector && !ST->hasSSE1()) 170 return 0; 171 172 if (ST->is64Bit()) 173 return 16; 174 return 8; 175 } 176 177 unsigned X86TTI::getRegisterBitWidth(bool Vector) const { 178 if (Vector) { 179 if (ST->hasAVX()) return 256; 180 if (ST->hasSSE1()) return 128; 181 return 0; 182 } 183 184 if (ST->is64Bit()) 185 return 64; 186 return 32; 187 188 } 189 190 unsigned X86TTI::getMaximumUnrollFactor() const { 191 if (ST->isAtom()) 192 return 1; 193 194 // Sandybridge and Haswell have multiple execution ports and pipelined 195 // vector units. 196 if (ST->hasAVX()) 197 return 4; 198 199 return 2; 200 } 201 202 unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { 203 // Legalize the type. 204 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); 205 206 int ISD = TLI->InstructionOpcodeToISD(Opcode); 207 assert(ISD && "Invalid opcode"); 208 209 static const X86CostTblEntry AVX1CostTable[] = { 210 // We don't have to scalarize unsupported ops. We can issue two half-sized 211 // operations and we only need to extract the upper YMM half. 212 // Two ops + 1 extract + 1 insert = 4. 213 { ISD::MUL, MVT::v8i32, 4 }, 214 { ISD::SUB, MVT::v8i32, 4 }, 215 { ISD::ADD, MVT::v8i32, 4 }, 216 { ISD::MUL, MVT::v4i64, 4 }, 217 { ISD::SUB, MVT::v4i64, 4 }, 218 { ISD::ADD, MVT::v4i64, 4 }, 219 }; 220 221 // Look for AVX1 lowering tricks. 222 if (ST->hasAVX()) { 223 int Idx = FindInTable(AVX1CostTable, array_lengthof(AVX1CostTable), ISD, 224 LT.second); 225 if (Idx != -1) 226 return LT.first * AVX1CostTable[Idx].Cost; 227 } 228 // Fallback to the default implementation. 229 return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty); 230 } 231 232 unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, 233 Type *SubTp) const { 234 // We only estimate the cost of reverse shuffles. 235 if (Kind != SK_Reverse) 236 return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); 237 238 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); 239 unsigned Cost = 1; 240 if (LT.second.getSizeInBits() > 128) 241 Cost = 3; // Extract + insert + copy. 242 243 // Multiple by the number of parts. 244 return Cost * LT.first; 245 } 246 247 unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { 248 int ISD = TLI->InstructionOpcodeToISD(Opcode); 249 assert(ISD && "Invalid opcode"); 250 251 EVT SrcTy = TLI->getValueType(Src); 252 EVT DstTy = TLI->getValueType(Dst); 253 254 if (!SrcTy.isSimple() || !DstTy.isSimple()) 255 return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); 256 257 static const X86TypeConversionCostTblEntry AVXConversionTbl[] = { 258 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, 259 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, 260 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, 261 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, 262 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, 263 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 }, 264 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 }, 265 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 }, 266 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 }, 267 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 }, 268 { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 }, 269 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, 270 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 }, 271 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 }, 272 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 }, 273 }; 274 275 if (ST->hasAVX()) { 276 int Idx = FindInConvertTable(AVXConversionTbl, 277 array_lengthof(AVXConversionTbl), 278 ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()); 279 if (Idx != -1) 280 return AVXConversionTbl[Idx].Cost; 281 } 282 283 return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); 284 } 285 286 unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 287 Type *CondTy) const { 288 // Legalize the type. 289 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy); 290 291 MVT MTy = LT.second; 292 293 int ISD = TLI->InstructionOpcodeToISD(Opcode); 294 assert(ISD && "Invalid opcode"); 295 296 static const X86CostTblEntry SSE42CostTbl[] = { 297 { ISD::SETCC, MVT::v2f64, 1 }, 298 { ISD::SETCC, MVT::v4f32, 1 }, 299 { ISD::SETCC, MVT::v2i64, 1 }, 300 { ISD::SETCC, MVT::v4i32, 1 }, 301 { ISD::SETCC, MVT::v8i16, 1 }, 302 { ISD::SETCC, MVT::v16i8, 1 }, 303 }; 304 305 static const X86CostTblEntry AVX1CostTbl[] = { 306 { ISD::SETCC, MVT::v4f64, 1 }, 307 { ISD::SETCC, MVT::v8f32, 1 }, 308 // AVX1 does not support 8-wide integer compare. 309 { ISD::SETCC, MVT::v4i64, 4 }, 310 { ISD::SETCC, MVT::v8i32, 4 }, 311 { ISD::SETCC, MVT::v16i16, 4 }, 312 { ISD::SETCC, MVT::v32i8, 4 }, 313 }; 314 315 static const X86CostTblEntry AVX2CostTbl[] = { 316 { ISD::SETCC, MVT::v4i64, 1 }, 317 { ISD::SETCC, MVT::v8i32, 1 }, 318 { ISD::SETCC, MVT::v16i16, 1 }, 319 { ISD::SETCC, MVT::v32i8, 1 }, 320 }; 321 322 if (ST->hasAVX2()) { 323 int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy); 324 if (Idx != -1) 325 return LT.first * AVX2CostTbl[Idx].Cost; 326 } 327 328 if (ST->hasAVX()) { 329 int Idx = FindInTable(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy); 330 if (Idx != -1) 331 return LT.first * AVX1CostTbl[Idx].Cost; 332 } 333 334 if (ST->hasSSE42()) { 335 int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy); 336 if (Idx != -1) 337 return LT.first * SSE42CostTbl[Idx].Cost; 338 } 339 340 return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); 341 } 342 343 unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val, 344 unsigned Index) const { 345 assert(Val->isVectorTy() && "This must be a vector type"); 346 347 if (Index != -1U) { 348 // Legalize the type. 349 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val); 350 351 // This type is legalized to a scalar type. 352 if (!LT.second.isVector()) 353 return 0; 354 355 // The type may be split. Normalize the index to the new type. 356 unsigned Width = LT.second.getVectorNumElements(); 357 Index = Index % Width; 358 359 // Floating point scalars are already located in index #0. 360 if (Val->getScalarType()->isFloatingPointTy() && Index == 0) 361 return 0; 362 } 363 364 return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); 365 } 366 367 unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, 368 unsigned AddressSpace) const { 369 // Legalize the type. 370 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); 371 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && 372 "Invalid Opcode"); 373 374 // Each load/store unit costs 1. 375 unsigned Cost = LT.first * 1; 376 377 // On Sandybridge 256bit load/stores are double pumped 378 // (but not on Haswell). 379 if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2()) 380 Cost*=2; 381 382 return Cost; 383 } 384