//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

// Pull in the tablegen-generated subtarget constructor and target
// description tables (ParseSubtargetFeatures etc.).
#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

/// Resolve the CPU/TuneCPU/feature strings, parse them into subtarget
/// feature bits, and initialize the CPU-specific tuning properties.
/// Returns *this so it can be used inside the constructor's member-init
/// list (see the InstrInfo initializer below).
AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
  // Determine default and user-specified characteristics

  if (CPUString.empty())
    CPUString = "generic";

  // When no explicit tuning CPU is given, tune for the target CPU itself.
  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties();

  return *this;
}

/// Set per-CPU tuning knobs (cache line size, preferred alignments,
/// interleave factors, prefetch parameters, ...) based on the processor
/// family selected by feature parsing. Values not set here keep the
/// defaults from AArch64Subtarget.h.
void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    break;
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA510:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexX2:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    break;
  case NeoverseN2:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 2;
    break;
  case Neoverse512TVB:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  }
}

/// Construct the subtarget. Note the member-init-list ordering:
/// InstrInfo's initializer runs initializeSubtargetDependencies(), so
/// feature bits and tuning properties are valid before TLInfo (which
/// reads them via *this) is constructed.
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &TuneCPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      FrameLowering(),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)), TSInfo(),
      TLInfo(TM, *this) {
  // Some platforms (e.g. Darwin, Windows) reserve x18 as a platform register.
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  // Set up the GlobalISel pipeline objects for this subtarget.
  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  // The register-bank info must exist before the instruction selector is
  // created (the selector keeps a reference to it); ownership is handed to
  // RegBankInfo afterwards.
  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  // Non-dso_local globals are resolved through the GOT (with extra
  // stub/dllimport decoration on Windows).
  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

/// Like ClassifyGlobalReference, but for direct calls / function symbols.
unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling. 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or Disabling the latency heuristic is a close call: It seems to
  // help nearly no benchmark on out-of-order architectures, on the other hand
  // it regresses register pressure on a few benchmarking.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

/// Whether we may assume the top byte of addresses is ignored (Arm TBI).
/// Gated on the -aarch64-use-tbi flag; currently only trusted on iOS >= 8,
/// where the OS guarantees TBI for userspace.
bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isiOS()) {
    unsigned Major, Minor, Micro;
    TargetTriple.getiOSVersion(Major, Minor, Micro);
    return Major >= 8;
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  // The A57 FP chaining constraint is only registered when FP op
  // balancing is enabled for this CPU.
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions, specify explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
  // Prefer NEON unless larger SVE registers are available.
  return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
}

bool AArch64Subtarget::useAA() const { return UseAA; }