//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/AArch64TargetParser.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

// NOTE(review): these macros are defined immediately before the generated
// include and presumably select which parts of it are emitted; keep them
// ahead of the #include.
#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

// Hidden flag, on by default. Its value is what
// AArch64Subtarget::enableEarlyIfConversion() returns.
static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If OS supports TBI, use this flag to enable it.
// Hidden flag, off by default. While it is unset,
// AArch64Subtarget::supportsAddressTopByteIgnored() returns false for every
// target.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

// Hidden flag, off by default. Consulted by
// AArch64Subtarget::classifyGlobalFunctionReference() when deciding whether a
// nonlazybind function is referenced through the GOT.
static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

// On by default. Its value is what AArch64Subtarget::useAA() returns.
static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

/// Resolve the CPU names, parse the feature string and apply per-CPU tuning.
///
/// \param FS            Subtarget feature string, forwarded unchanged to
///                      ParseSubtargetFeatures.
/// \param CPUString     CPU name; an empty name is replaced by "generic".
/// \param TuneCPUString Tuning CPU name; an empty name falls back to the
///                      (possibly defaulted) \p CPUString.
/// \returns *this, which lets the constructor call this function from inside
///          its member-initializer list.
AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
  // Determine default and user-specified characteristics

  if (CPUString.empty())
    CPUString = "generic";

  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  // Must follow feature parsing: the switch in initializeProperties() reads
  // ARMProcFamily, which is presumably set by ParseSubtargetFeatures
  // (TODO confirm against the generated code).
  initializeProperties();

  return *this;
}

/// Assign CPU-family-specific tuning members based on ARMProcFamily.
///
/// Only the members named in the matching case are written; every other
/// member keeps the value it had before the call. The switch has no
/// `default:` label, so a family that is not listed changes nothing.
/// NOTE(review): the *LogAlignment members are presumably log2 of a byte
/// alignment (name-based guess; confirm against the header).
void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
  switch (ARMProcFamily) {
  case Others:
    // No family-specific tuning.
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    // No family-specific tuning.
    break;
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
  case CortexX1C:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA510:
  case CortexA710:
  case CortexX2:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    break;
  case Neoverse512TVB:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  }
}

/// Construct the subtarget for triple \p TT.
///
/// Feature parsing and per-CPU tuning run inside the member-initializer list:
/// initializeSubtargetDependencies() is evaluated as the argument used to
/// construct InstrInfo. The constructor body then reserves X18 where the
/// triple requires it and creates the call-lowering, inline-asm-lowering,
/// legalizer, register-bank and instruction-selector objects that the getters
/// below hand out.
///
/// NOTE(review): members are initialized in the order they are declared in
/// the class, not in the order written here; the header is not visible from
/// this file, so confirm that the members read during
/// initializeSubtargetDependencies() are declared before InstrInfo.
///
/// \param TT   Target triple; also stored in TargetTriple.
/// \param CPU  CPU name (may be empty, see initializeSubtargetDependencies).
/// \param TuneCPU Tuning CPU name (may be empty).
/// \param FS   Subtarget feature string.
/// \param TM   Target machine; cast to AArch64TargetMachine when the
///             instruction selector is created.
/// \param LittleEndian Stored in IsLittle.
/// \param MinSVEVectorSizeInBitsOverride Stored in MinSVEVectorSizeInBits.
/// \param MaxSVEVectorSizeInBitsOverride Stored in MaxSVEVectorSizeInBits.
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &TuneCPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
      TLInfo(TM, *this) {
  // Mark X18 as reserved on triples that reserve it by default.
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  // RBI stays a raw pointer until the selector has been created from it;
  // RegBankInfo.reset(RBI) below then takes it over (RegBankInfo is
  // presumably a std::unique_ptr, declared in the header).
  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}

/// Return the call-lowering object created in the constructor.
const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

/// Return the inline-asm-lowering object created in the constructor.
const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

/// Return the instruction selector created in the constructor.
InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

/// Return the legalizer info created in the constructor.
const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

/// Return the register-bank info created in the constructor.
const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
///
/// The checks are ordered and the first match wins.
///
/// \param GV The global being referenced; dereferenced unconditionally, so it
///           must not be null.
/// \param TM Supplies the code model and the DSO-local query.
/// \returns A combination of AArch64II::MO_* flags; MO_NO_FLAG when none of
///          the special cases applies.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  // Not known to be DSO-local: always via the GOT, with an extra flag for
  // dllimport globals or, failing that, for Windows targets.
  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

/// Find the target operand flags for a reference to a global used as a
/// function.
///
/// \param GV The global being referenced; dereferenced unconditionally, so it
///           must not be null. It need not be a Function: the nonlazybind
///           check is skipped when the dyn_cast fails.
/// \param TM Supplies the code model and the DSO-local query.
/// \returns MO_GOT for the MachO-large-model and nonlazybind cases, the
///          result of ClassifyGlobalReference() on Windows, and MO_NO_FLAG
///          otherwise.
unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}

/// Adjust the machine scheduler policy for this subtarget.
///
/// Clears both single-direction flags and copies DisableLatencySchedHeuristic
/// (declared outside this file) into the policy.
///
/// \param Policy          Policy object updated in place.
/// \param NumRegionInstrs Not used by this implementation.
void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling. 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or Disabling the latency heuristic is a close call: It seems to
  // help nearly no benchmark on out-of-order architectures, on the other hand
  // it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

/// Return the value of the -aarch64-early-ifcvt flag.
bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

/// Report whether the top byte of an address may be treated as ignored.
///
/// Returns true only when -aarch64-use-tbi is set and the target triple is
/// either DriverKit, or iOS with a version of at least 8; every other
/// combination returns false.
bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  // The flag is a hard gate: without it no OS qualifies.
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isDriverKit())
    return true;
  if (TargetTriple.isiOS()) {
    return TargetTriple.getiOSVersion() >= VersionTuple(8);
  }

  return false;
}

/// Return a newly allocated A57ChainingConstraint when balanceFPOps() is
/// true, and a null pointer otherwise.
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

/// Hook run after a .mir file has been loaded into \p MF; fills in the
/// maximum call frame size when the frame info has not computed it yet.
void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions, specify explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

/// Return the value of the -aarch64-use-aa flag.
bool AArch64Subtarget::useAA() const { return UseAA; }