1 //===-- PPCTargetTransformInfo.cpp - PPC specific TTI pass ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// \file 10 /// This file implements a TargetTransformInfo analysis pass specific to the 11 /// PPC target machine. It uses the target's detailed information to provide 12 /// more precise answers to certain TTI queries, while letting the target 13 /// independent and default TTI implementations handle the rest. 14 /// 15 //===----------------------------------------------------------------------===// 16 17 #define DEBUG_TYPE "ppctti" 18 #include "PPC.h" 19 #include "PPCTargetMachine.h" 20 #include "llvm/Analysis/TargetTransformInfo.h" 21 #include "llvm/Support/CommandLine.h" 22 #include "llvm/Support/Debug.h" 23 #include "llvm/Target/CostTable.h" 24 #include "llvm/Target/TargetLowering.h" 25 using namespace llvm; 26 27 static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting", 28 cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden); 29 30 // Declare the pass initialization routine locally as target-specific passes 31 // don't havve a target-wide initialization entry point, and so we rely on the 32 // pass constructor initialization. 33 namespace llvm { 34 void initializePPCTTIPass(PassRegistry &); 35 } 36 37 namespace { 38 39 class PPCTTI final : public ImmutablePass, public TargetTransformInfo { 40 const PPCSubtarget *ST; 41 const PPCTargetLowering *TLI; 42 43 public: 44 PPCTTI() : ImmutablePass(ID), ST(0), TLI(0) { 45 llvm_unreachable("This pass cannot be directly constructed"); 46 } 47 48 PPCTTI(const PPCTargetMachine *TM) 49 : ImmutablePass(ID), ST(TM->getSubtargetImpl()), 50 TLI(TM->getTargetLowering()) { 51 initializePPCTTIPass(*PassRegistry::getPassRegistry()); 52 } 53 54 virtual void initializePass() override { 55 pushTTIStack(this); 56 } 57 58 virtual void getAnalysisUsage(AnalysisUsage &AU) const override { 59 TargetTransformInfo::getAnalysisUsage(AU); 60 } 61 62 /// Pass identification. 63 static char ID; 64 65 /// Provide necessary pointer adjustments for the two base classes. 66 virtual void *getAdjustedAnalysisPointer(const void *ID) override { 67 if (ID == &TargetTransformInfo::ID) 68 return (TargetTransformInfo*)this; 69 return this; 70 } 71 72 /// \name Scalar TTI Implementations 73 /// @{ 74 unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override; 75 76 unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, 77 Type *Ty) const override; 78 unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, 79 Type *Ty) const override; 80 81 virtual PopcntSupportKind 82 getPopcntSupport(unsigned TyWidth) const override; 83 virtual void getUnrollingPreferences( 84 Loop *L, UnrollingPreferences &UP) const override; 85 86 /// @} 87 88 /// \name Vector TTI Implementations 89 /// @{ 90 91 virtual unsigned getNumberOfRegisters(bool Vector) const override; 92 virtual unsigned getRegisterBitWidth(bool Vector) const override; 93 virtual unsigned getMaximumUnrollFactor() const override; 94 virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, 95 OperandValueKind, 96 OperandValueKind) const override; 97 virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, 98 int Index, Type *SubTp) const override; 99 virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst, 100 Type *Src) const override; 101 virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 102 Type *CondTy) const override; 103 virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val, 104 unsigned Index) const override; 105 virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src, 106 unsigned Alignment, 107 unsigned AddressSpace) const override; 108 109 /// @} 110 }; 111 112 } // end anonymous namespace 113 114 INITIALIZE_AG_PASS(PPCTTI, TargetTransformInfo, "ppctti", 115 "PPC Target Transform Info", true, true, false) 116 char PPCTTI::ID = 0; 117 118 ImmutablePass * 119 llvm::createPPCTargetTransformInfoPass(const PPCTargetMachine *TM) { 120 return new PPCTTI(TM); 121 } 122 123 124 //===----------------------------------------------------------------------===// 125 // 126 // PPC cost model. 127 // 128 //===----------------------------------------------------------------------===// 129 130 PPCTTI::PopcntSupportKind PPCTTI::getPopcntSupport(unsigned TyWidth) const { 131 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 132 if (ST->hasPOPCNTD() && TyWidth <= 64) 133 return PSK_FastHardware; 134 return PSK_Software; 135 } 136 137 unsigned PPCTTI::getIntImmCost(const APInt &Imm, Type *Ty) const { 138 if (DisablePPCConstHoist) 139 return TargetTransformInfo::getIntImmCost(Imm, Ty); 140 141 assert(Ty->isIntegerTy()); 142 143 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 144 if (BitSize == 0) 145 return ~0U; 146 147 if (Imm == 0) 148 return TCC_Free; 149 150 if (Imm.getBitWidth() <= 64) { 151 if (isInt<16>(Imm.getSExtValue())) 152 return TCC_Basic; 153 154 if (isInt<32>(Imm.getSExtValue())) { 155 // A constant that can be materialized using lis. 156 if ((Imm.getZExtValue() & 0xFFFF) == 0) 157 return TCC_Basic; 158 159 return 2 * TCC_Basic; 160 } 161 } 162 163 return 4 * TCC_Basic; 164 } 165 166 unsigned PPCTTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx, 167 const APInt &Imm, Type *Ty) const { 168 if (DisablePPCConstHoist) 169 return TargetTransformInfo::getIntImmCost(IID, Idx, Imm, Ty); 170 171 assert(Ty->isIntegerTy()); 172 173 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 174 if (BitSize == 0) 175 return ~0U; 176 177 switch (IID) { 178 default: return TCC_Free; 179 case Intrinsic::sadd_with_overflow: 180 case Intrinsic::uadd_with_overflow: 181 case Intrinsic::ssub_with_overflow: 182 case Intrinsic::usub_with_overflow: 183 if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue())) 184 return TCC_Free; 185 break; 186 } 187 return PPCTTI::getIntImmCost(Imm, Ty); 188 } 189 190 unsigned PPCTTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, 191 Type *Ty) const { 192 if (DisablePPCConstHoist) 193 return TargetTransformInfo::getIntImmCost(Opcode, Idx, Imm, Ty); 194 195 assert(Ty->isIntegerTy()); 196 197 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 198 if (BitSize == 0) 199 return ~0U; 200 201 unsigned ImmIdx = ~0U; 202 bool ShiftedFree = false, RunFree = false, UnsignedFree = false, 203 ZeroFree = false; 204 switch (Opcode) { 205 default: return TCC_Free; 206 case Instruction::GetElementPtr: 207 // Always hoist the base address of a GetElementPtr. This prevents the 208 // creation of new constants for every base constant that gets constant 209 // folded with the offset. 210 if (Idx == 0) 211 return 2 * TCC_Basic; 212 return TCC_Free; 213 case Instruction::And: 214 RunFree = true; // (for the rotate-and-mask instructions) 215 // Fallthrough... 216 case Instruction::Add: 217 case Instruction::Or: 218 case Instruction::Xor: 219 ShiftedFree = true; 220 // Fallthrough... 221 case Instruction::Sub: 222 case Instruction::Mul: 223 case Instruction::Shl: 224 case Instruction::LShr: 225 case Instruction::AShr: 226 ImmIdx = 1; 227 break; 228 case Instruction::ICmp: 229 UnsignedFree = true; 230 ImmIdx = 1; 231 // Fallthrough... (zero comparisons can use record-form instructions) 232 case Instruction::Select: 233 ZeroFree = true; 234 break; 235 case Instruction::PHI: 236 case Instruction::Call: 237 case Instruction::Ret: 238 case Instruction::Load: 239 case Instruction::Store: 240 break; 241 } 242 243 if (ZeroFree && Imm == 0) 244 return TCC_Free; 245 246 if (Idx == ImmIdx && Imm.getBitWidth() <= 64) { 247 if (isInt<16>(Imm.getSExtValue())) 248 return TCC_Free; 249 250 if (RunFree) { 251 if (Imm.getBitWidth() <= 32 && 252 (isShiftedMask_32(Imm.getZExtValue()) || 253 isShiftedMask_32(~Imm.getZExtValue()))) 254 return TCC_Free; 255 256 257 if (ST->isPPC64() && 258 (isShiftedMask_64(Imm.getZExtValue()) || 259 isShiftedMask_64(~Imm.getZExtValue()))) 260 return TCC_Free; 261 } 262 263 if (UnsignedFree && isUInt<16>(Imm.getZExtValue())) 264 return TCC_Free; 265 266 if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0) 267 return TCC_Free; 268 } 269 270 return PPCTTI::getIntImmCost(Imm, Ty); 271 } 272 273 void PPCTTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const { 274 if (ST->getDarwinDirective() == PPC::DIR_A2) { 275 // The A2 is in-order with a deep pipeline, and concatenation unrolling 276 // helps expose latency-hiding opportunities to the instruction scheduler. 277 UP.Partial = UP.Runtime = true; 278 } 279 } 280 281 unsigned PPCTTI::getNumberOfRegisters(bool Vector) const { 282 if (Vector && !ST->hasAltivec()) 283 return 0; 284 return ST->hasVSX() ? 64 : 32; 285 } 286 287 unsigned PPCTTI::getRegisterBitWidth(bool Vector) const { 288 if (Vector) { 289 if (ST->hasAltivec()) return 128; 290 return 0; 291 } 292 293 if (ST->isPPC64()) 294 return 64; 295 return 32; 296 297 } 298 299 unsigned PPCTTI::getMaximumUnrollFactor() const { 300 unsigned Directive = ST->getDarwinDirective(); 301 // The 440 has no SIMD support, but floating-point instructions 302 // have a 5-cycle latency, so unroll by 5x for latency hiding. 303 if (Directive == PPC::DIR_440) 304 return 5; 305 306 // The A2 has no SIMD support, but floating-point instructions 307 // have a 6-cycle latency, so unroll by 6x for latency hiding. 308 if (Directive == PPC::DIR_A2) 309 return 6; 310 311 // FIXME: For lack of any better information, do no harm... 312 if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) 313 return 1; 314 315 // For most things, modern systems have two execution units (and 316 // out-of-order execution). 317 return 2; 318 } 319 320 unsigned PPCTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, 321 OperandValueKind Op1Info, 322 OperandValueKind Op2Info) const { 323 assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); 324 325 // Fallback to the default implementation. 326 return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, 327 Op2Info); 328 } 329 330 unsigned PPCTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, 331 Type *SubTp) const { 332 return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); 333 } 334 335 unsigned PPCTTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { 336 assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); 337 338 return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); 339 } 340 341 unsigned PPCTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 342 Type *CondTy) const { 343 return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); 344 } 345 346 unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val, 347 unsigned Index) const { 348 assert(Val->isVectorTy() && "This must be a vector type"); 349 350 int ISD = TLI->InstructionOpcodeToISD(Opcode); 351 assert(ISD && "Invalid opcode"); 352 353 if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) { 354 // Double-precision scalars are already located in index #0. 355 if (Index == 0) 356 return 0; 357 358 return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); 359 } 360 361 // Estimated cost of a load-hit-store delay. This was obtained 362 // experimentally as a minimum needed to prevent unprofitable 363 // vectorization for the paq8p benchmark. It may need to be 364 // raised further if other unprofitable cases remain. 365 unsigned LHSPenalty = 2; 366 if (ISD == ISD::INSERT_VECTOR_ELT) 367 LHSPenalty += 7; 368 369 // Vector element insert/extract with Altivec is very expensive, 370 // because they require store and reload with the attendant 371 // processor stall for load-hit-store. Until VSX is available, 372 // these need to be estimated as very costly. 373 if (ISD == ISD::EXTRACT_VECTOR_ELT || 374 ISD == ISD::INSERT_VECTOR_ELT) 375 return LHSPenalty + 376 TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); 377 378 return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); 379 } 380 381 unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, 382 unsigned AddressSpace) const { 383 // Legalize the type. 384 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); 385 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && 386 "Invalid Opcode"); 387 388 unsigned Cost = 389 TargetTransformInfo::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); 390 391 // VSX loads/stores support unaligned access. 392 if (ST->hasVSX()) { 393 if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64) 394 return Cost; 395 } 396 397 bool UnalignedAltivec = 398 Src->isVectorTy() && 399 Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() && 400 LT.second.getSizeInBits() == 128 && 401 Opcode == Instruction::Load; 402 403 // PPC in general does not support unaligned loads and stores. They'll need 404 // to be decomposed based on the alignment factor. 405 unsigned SrcBytes = LT.second.getStoreSize(); 406 if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) { 407 Cost += LT.first*(SrcBytes/Alignment-1); 408 409 // For a vector type, there is also scalarization overhead (only for 410 // stores, loads are expanded using the vector-load + permutation sequence, 411 // which is much less expensive). 412 if (Src->isVectorTy() && Opcode == Instruction::Store) 413 for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i) 414 Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i); 415 } 416 417 return Cost; 418 } 419 420