1 //===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the AArch64 specific subclass of TargetSubtarget. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "AArch64Subtarget.h" 14 15 #include "AArch64.h" 16 #include "AArch64InstrInfo.h" 17 #include "AArch64PBQPRegAlloc.h" 18 #include "AArch64TargetMachine.h" 19 #include "GISel/AArch64CallLowering.h" 20 #include "GISel/AArch64LegalizerInfo.h" 21 #include "GISel/AArch64RegisterBankInfo.h" 22 #include "MCTargetDesc/AArch64AddressingModes.h" 23 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" 24 #include "llvm/CodeGen/MachineFrameInfo.h" 25 #include "llvm/CodeGen/MachineScheduler.h" 26 #include "llvm/IR/GlobalValue.h" 27 #include "llvm/Support/AArch64TargetParser.h" 28 #include "llvm/Support/TargetParser.h" 29 30 using namespace llvm; 31 32 #define DEBUG_TYPE "aarch64-subtarget" 33 34 #define GET_SUBTARGETINFO_CTOR 35 #define GET_SUBTARGETINFO_TARGET_DESC 36 #include "AArch64GenSubtargetInfo.inc" 37 38 static cl::opt<bool> 39 EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if " 40 "converter pass"), cl::init(true), cl::Hidden); 41 42 // If OS supports TBI, use this flag to enable it. 43 static cl::opt<bool> 44 UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of " 45 "an address is ignored"), cl::init(false), cl::Hidden); 46 47 static cl::opt<bool> 48 UseNonLazyBind("aarch64-enable-nonlazybind", 49 cl::desc("Call nonlazybind functions via direct GOT load"), 50 cl::init(false), cl::Hidden); 51 52 static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true), 53 cl::desc("Enable the use of AA during codegen.")); 54 55 AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies( 56 StringRef FS, StringRef CPUString, StringRef TuneCPUString) { 57 // Determine default and user-specified characteristics 58 59 if (CPUString.empty()) 60 CPUString = "generic"; 61 62 if (TuneCPUString.empty()) 63 TuneCPUString = CPUString; 64 65 ParseSubtargetFeatures(CPUString, TuneCPUString, FS); 66 initializeProperties(); 67 68 return *this; 69 } 70 71 void AArch64Subtarget::initializeProperties() { 72 // Initialize CPU specific properties. We should add a tablegen feature for 73 // this in the future so we can specify it together with the subtarget 74 // features. 75 switch (ARMProcFamily) { 76 case Others: 77 break; 78 case Carmel: 79 CacheLineSize = 64; 80 break; 81 case CortexA35: 82 case CortexA53: 83 case CortexA55: 84 PrefFunctionLogAlignment = 4; 85 PrefLoopLogAlignment = 4; 86 MaxBytesForLoopAlignment = 8; 87 break; 88 case CortexA57: 89 MaxInterleaveFactor = 4; 90 PrefFunctionLogAlignment = 4; 91 PrefLoopLogAlignment = 4; 92 MaxBytesForLoopAlignment = 8; 93 break; 94 case CortexA65: 95 PrefFunctionLogAlignment = 3; 96 break; 97 case CortexA72: 98 case CortexA73: 99 case CortexA75: 100 PrefFunctionLogAlignment = 4; 101 PrefLoopLogAlignment = 4; 102 MaxBytesForLoopAlignment = 8; 103 break; 104 case CortexA76: 105 case CortexA77: 106 case CortexA78: 107 case CortexA78C: 108 case CortexR82: 109 case CortexX1: 110 case CortexX1C: 111 PrefFunctionLogAlignment = 4; 112 PrefLoopLogAlignment = 5; 113 MaxBytesForLoopAlignment = 16; 114 break; 115 case CortexA510: 116 PrefFunctionLogAlignment = 4; 117 VScaleForTuning = 1; 118 PrefLoopLogAlignment = 4; 119 MaxBytesForLoopAlignment = 8; 120 break; 121 case CortexA710: 122 case CortexX2: 123 PrefFunctionLogAlignment = 4; 124 VScaleForTuning = 1; 125 PrefLoopLogAlignment = 5; 126 MaxBytesForLoopAlignment = 16; 127 break; 128 case A64FX: 129 CacheLineSize = 256; 130 PrefFunctionLogAlignment = 3; 131 PrefLoopLogAlignment = 2; 132 MaxInterleaveFactor = 4; 133 PrefetchDistance = 128; 134 MinPrefetchStride = 1024; 135 MaxPrefetchIterationsAhead = 4; 136 VScaleForTuning = 4; 137 break; 138 case AppleA7: 139 case AppleA10: 140 case AppleA11: 141 case AppleA12: 142 case AppleA13: 143 case AppleA14: 144 CacheLineSize = 64; 145 PrefetchDistance = 280; 146 MinPrefetchStride = 2048; 147 MaxPrefetchIterationsAhead = 3; 148 break; 149 case ExynosM3: 150 MaxInterleaveFactor = 4; 151 MaxJumpTableSize = 20; 152 PrefFunctionLogAlignment = 5; 153 PrefLoopLogAlignment = 4; 154 break; 155 case Falkor: 156 MaxInterleaveFactor = 4; 157 // FIXME: remove this to enable 64-bit SLP if performance looks good. 158 MinVectorRegisterBitWidth = 128; 159 CacheLineSize = 128; 160 PrefetchDistance = 820; 161 MinPrefetchStride = 2048; 162 MaxPrefetchIterationsAhead = 8; 163 break; 164 case Kryo: 165 MaxInterleaveFactor = 4; 166 VectorInsertExtractBaseCost = 2; 167 CacheLineSize = 128; 168 PrefetchDistance = 740; 169 MinPrefetchStride = 1024; 170 MaxPrefetchIterationsAhead = 11; 171 // FIXME: remove this to enable 64-bit SLP if performance looks good. 172 MinVectorRegisterBitWidth = 128; 173 break; 174 case NeoverseE1: 175 PrefFunctionLogAlignment = 3; 176 break; 177 case NeoverseN1: 178 PrefFunctionLogAlignment = 4; 179 PrefLoopLogAlignment = 5; 180 MaxBytesForLoopAlignment = 16; 181 break; 182 case NeoverseN2: 183 PrefFunctionLogAlignment = 4; 184 PrefLoopLogAlignment = 5; 185 MaxBytesForLoopAlignment = 16; 186 VScaleForTuning = 1; 187 break; 188 case NeoverseV1: 189 PrefFunctionLogAlignment = 4; 190 PrefLoopLogAlignment = 5; 191 MaxBytesForLoopAlignment = 16; 192 VScaleForTuning = 2; 193 break; 194 case Neoverse512TVB: 195 PrefFunctionLogAlignment = 4; 196 VScaleForTuning = 1; 197 MaxInterleaveFactor = 4; 198 break; 199 case Saphira: 200 MaxInterleaveFactor = 4; 201 // FIXME: remove this to enable 64-bit SLP if performance looks good. 202 MinVectorRegisterBitWidth = 128; 203 break; 204 case ThunderX2T99: 205 CacheLineSize = 64; 206 PrefFunctionLogAlignment = 3; 207 PrefLoopLogAlignment = 2; 208 MaxInterleaveFactor = 4; 209 PrefetchDistance = 128; 210 MinPrefetchStride = 1024; 211 MaxPrefetchIterationsAhead = 4; 212 // FIXME: remove this to enable 64-bit SLP if performance looks good. 213 MinVectorRegisterBitWidth = 128; 214 break; 215 case ThunderX: 216 case ThunderXT88: 217 case ThunderXT81: 218 case ThunderXT83: 219 CacheLineSize = 128; 220 PrefFunctionLogAlignment = 3; 221 PrefLoopLogAlignment = 2; 222 // FIXME: remove this to enable 64-bit SLP if performance looks good. 223 MinVectorRegisterBitWidth = 128; 224 break; 225 case TSV110: 226 CacheLineSize = 64; 227 PrefFunctionLogAlignment = 4; 228 PrefLoopLogAlignment = 2; 229 break; 230 case ThunderX3T110: 231 CacheLineSize = 64; 232 PrefFunctionLogAlignment = 4; 233 PrefLoopLogAlignment = 2; 234 MaxInterleaveFactor = 4; 235 PrefetchDistance = 128; 236 MinPrefetchStride = 1024; 237 MaxPrefetchIterationsAhead = 4; 238 // FIXME: remove this to enable 64-bit SLP if performance looks good. 239 MinVectorRegisterBitWidth = 128; 240 break; 241 } 242 } 243 244 AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, 245 const std::string &TuneCPU, 246 const std::string &FS, 247 const TargetMachine &TM, bool LittleEndian, 248 unsigned MinSVEVectorSizeInBitsOverride, 249 unsigned MaxSVEVectorSizeInBitsOverride) 250 : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS), 251 ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()), 252 CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()), 253 IsLittle(LittleEndian), 254 MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride), 255 MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT), 256 InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)), 257 TLInfo(TM, *this) { 258 if (AArch64::isX18ReservedByDefault(TT)) 259 ReserveXRegister.set(18); 260 261 CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering())); 262 InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering())); 263 Legalizer.reset(new AArch64LegalizerInfo(*this)); 264 265 auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo()); 266 267 // FIXME: At this point, we can't rely on Subtarget having RBI. 268 // It's awkward to mix passing RBI and the Subtarget; should we pass 269 // TII/TRI as well? 270 InstSelector.reset(createAArch64InstructionSelector( 271 *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI)); 272 273 RegBankInfo.reset(RBI); 274 } 275 276 const CallLowering *AArch64Subtarget::getCallLowering() const { 277 return CallLoweringInfo.get(); 278 } 279 280 const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const { 281 return InlineAsmLoweringInfo.get(); 282 } 283 284 InstructionSelector *AArch64Subtarget::getInstructionSelector() const { 285 return InstSelector.get(); 286 } 287 288 const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const { 289 return Legalizer.get(); 290 } 291 292 const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const { 293 return RegBankInfo.get(); 294 } 295 296 /// Find the target operand flags that describe how a global value should be 297 /// referenced for the current subtarget. 298 unsigned 299 AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, 300 const TargetMachine &TM) const { 301 // MachO large model always goes via a GOT, simply to get a single 8-byte 302 // absolute relocation on all global addresses. 303 if (TM.getCodeModel() == CodeModel::Large && isTargetMachO()) 304 return AArch64II::MO_GOT; 305 306 if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) { 307 if (GV->hasDLLImportStorageClass()) 308 return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT; 309 if (getTargetTriple().isOSWindows()) 310 return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB; 311 return AArch64II::MO_GOT; 312 } 313 314 // The small code model's direct accesses use ADRP, which cannot 315 // necessarily produce the value 0 (if the code is above 4GB). 316 // Same for the tiny code model, where we have a pc relative LDR. 317 if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) && 318 GV->hasExternalWeakLinkage()) 319 return AArch64II::MO_GOT; 320 321 // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate 322 // that their nominal addresses are tagged and outside of the code model. In 323 // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the 324 // tag if necessary based on MO_TAGGED. 325 if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType())) 326 return AArch64II::MO_NC | AArch64II::MO_TAGGED; 327 328 return AArch64II::MO_NO_FLAG; 329 } 330 331 unsigned AArch64Subtarget::classifyGlobalFunctionReference( 332 const GlobalValue *GV, const TargetMachine &TM) const { 333 // MachO large model always goes via a GOT, because we don't have the 334 // relocations available to do anything else.. 335 if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() && 336 !GV->hasInternalLinkage()) 337 return AArch64II::MO_GOT; 338 339 // NonLazyBind goes via GOT unless we know it's available locally. 340 auto *F = dyn_cast<Function>(GV); 341 if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) && 342 !TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 343 return AArch64II::MO_GOT; 344 345 // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB. 346 if (getTargetTriple().isOSWindows()) 347 return ClassifyGlobalReference(GV, TM); 348 349 return AArch64II::MO_NO_FLAG; 350 } 351 352 void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, 353 unsigned NumRegionInstrs) const { 354 // LNT run (at least on Cyclone) showed reasonably significant gains for 355 // bi-directional scheduling. 253.perlbmk. 356 Policy.OnlyTopDown = false; 357 Policy.OnlyBottomUp = false; 358 // Enabling or Disabling the latency heuristic is a close call: It seems to 359 // help nearly no benchmark on out-of-order architectures, on the other hand 360 // it regresses register pressure on a few benchmarking. 361 Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic; 362 } 363 364 bool AArch64Subtarget::enableEarlyIfConversion() const { 365 return EnableEarlyIfConvert; 366 } 367 368 bool AArch64Subtarget::supportsAddressTopByteIgnored() const { 369 if (!UseAddressTopByteIgnored) 370 return false; 371 372 if (TargetTriple.isDriverKit()) 373 return true; 374 if (TargetTriple.isiOS()) { 375 return TargetTriple.getiOSVersion() >= VersionTuple(8); 376 } 377 378 return false; 379 } 380 381 std::unique_ptr<PBQPRAConstraint> 382 AArch64Subtarget::getCustomPBQPConstraints() const { 383 return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr; 384 } 385 386 void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const { 387 // We usually compute max call frame size after ISel. Do the computation now 388 // if the .mir file didn't specify it. Note that this will probably give you 389 // bogus values after PEI has eliminated the callframe setup/destroy pseudo 390 // instructions, specify explicitly if you need it to be correct. 391 MachineFrameInfo &MFI = MF.getFrameInfo(); 392 if (!MFI.isMaxCallFrameSizeComputed()) 393 MFI.computeMaxCallFrameSize(MF); 394 } 395 396 bool AArch64Subtarget::useAA() const { return UseAA; } 397