//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

// Pull in the tablegen-erated subtarget constructor and target description
// tables (feature bits, CPU tables) from AArch64GenSubtargetInfo.inc.
#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

// On by default; exposed so the early if-conversion pass can be disabled for
// debugging or performance comparison (queried by enableEarlyIfConversion()).
static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If OS supports TBI, use this flag to enable it.
// Off by default: assuming top-byte-ignore is only sound when the target OS
// actually guarantees TBI; see supportsAddressTopByteIgnored() for the
// per-OS gating.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

// When set, calls to functions marked 'nonlazybind' that are not known to be
// DSO-local are routed through the GOT (see classifyGlobalFunctionReference).
static cl::opt<bool>
UseNonLazyBind("aarch64-enable-nonlazybind",
               cl::desc("Call nonlazybind functions via direct GOT load"),
               cl::init(false), cl::Hidden);

// Alias analysis during codegen is enabled by default on AArch64; queried by
// useAA() below.
static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

/// Resolve default CPU/tuning-CPU names, parse the feature string, and derive
/// the CPU-specific tuning properties.
///
/// \param FS            the subtarget feature string.
/// \param CPUString     target CPU name; defaults to "generic" when empty.
/// \param TuneCPUString tuning CPU name; defaults to \p CPUString when empty.
/// \returns *this, so the call can be embedded in the constructor's
///          member-initializer list (see the AArch64Subtarget constructor).
AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
  // Determine default and user-specified characteristics

  if (CPUString.empty())
    CPUString = "generic";

  // With no explicit tuning CPU, tune for the selected CPU itself.
  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties();

  return *this;
}

/// Set per-CPU tuning parameters (cache line size, preferred function/loop
/// alignments, prefetch distances, interleave factors, SVE vscale hints, ...)
/// based on the processor family chosen by the feature-string parse.
/// Families not listed keep the default member values.
void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    break;
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA510:
  case CortexA710:
  case CortexX2:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    break;
  case Neoverse512TVB:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  }
}

/// Construct the AArch64 subtarget.
///
/// NOTE(review): the member-initializer list is order-sensitive:
/// initializeSubtargetDependencies() is deliberately invoked inside
/// InstrInfo's initializer so the feature string is parsed (and tuning
/// properties set) before TLInfo is constructed from *this. This assumes
/// InstrInfo is declared before TLInfo in AArch64Subtarget.h, since members
/// initialize in declaration order — confirm against the header if reordering.
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &TuneCPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
      TLInfo(TM, *this) {
  // Some platforms (e.g. Darwin, Windows) reserve x18 as a platform register.
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  // Wire up the GlobalISel pipeline objects. Ownership of each is held by
  // the unique_ptr members; construction order matters because the
  // instruction selector consumes the register-bank info built just above.
  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  // Transfer ownership of RBI to the subtarget after the selector has
  // captured its reference.
  RegBankInfo.reset(RBI);
}

/// \returns the GlobalISel call-lowering implementation (non-owning).
const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

/// \returns the GlobalISel inline-asm lowering implementation (non-owning).
const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

/// \returns the GlobalISel instruction selector (non-owning).
InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

/// \returns the GlobalISel legalizer info (non-owning).
const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

/// \returns the GlobalISel register-bank info (non-owning).
const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  // Non-DSO-local globals need indirection: dllimport/COFF stubs on Windows,
  // otherwise a plain GOT access.
  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

/// Classify how a reference to the function \p GV should be materialized,
/// mirroring ClassifyGlobalReference but with function-specific handling for
/// nonlazybind and the MachO large code model.
unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}

/// Tweak the MachineScheduler's default policy for AArch64.
void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling. 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or Disabling the latency heuristic is a close call: It seems to
  // help nearly no benchmark on out-of-order architectures, on the other hand
  // it regresses register pressure on a few benchmarking.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

/// \returns true if the early if-conversion pass should run (controlled by
/// the -aarch64-early-ifcvt flag, default true).
bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

/// \returns true if codegen may assume the top byte of addresses is ignored
/// (TBI). Requires both the -aarch64-use-tbi flag and an OS known to enable
/// TBI; currently only iOS >= 8 qualifies here.
bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isiOS()) {
    return TargetTriple.getiOSVersion() >= VersionTuple(8);
  }

  return false;
}

/// \returns the A57 FP-chaining constraint for the PBQP register allocator
/// when FP-op balancing is enabled for this subtarget, else null.
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions, specify explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
  // Prefer NEON unless larger SVE registers are available.
  return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
}

/// \returns whether to use alias analysis during codegen (the
/// -aarch64-use-aa flag, default true).
bool AArch64Subtarget::useAA() const { return UseAA; }