1 //===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64 specific subclass of TargetSubtarget.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64Subtarget.h"
14 
15 #include "AArch64.h"
16 #include "AArch64InstrInfo.h"
17 #include "AArch64PBQPRegAlloc.h"
18 #include "AArch64TargetMachine.h"
19 #include "GISel/AArch64CallLowering.h"
20 #include "GISel/AArch64LegalizerInfo.h"
21 #include "GISel/AArch64RegisterBankInfo.h"
22 #include "MCTargetDesc/AArch64AddressingModes.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/IR/GlobalValue.h"
26 #include "llvm/Support/TargetParser.h"
27 
28 using namespace llvm;
29 
30 #define DEBUG_TYPE "aarch64-subtarget"
31 
32 #define GET_SUBTARGETINFO_CTOR
33 #define GET_SUBTARGETINFO_TARGET_DESC
34 #include "AArch64GenSubtargetInfo.inc"
35 
// Escape hatch to disable the early if-conversion pass (on by default; see
// enableEarlyIfConversion below).
static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

// When set, calls to nonlazybind functions are routed through a direct GOT
// load rather than referenced directly (see classifyGlobalFunctionReference).
static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

// Controls whether codegen consults alias analysis (see useAA); on by default.
static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));
52 
/// Parse the CPU/tuning-CPU/feature strings and initialize the CPU-specific
/// tuning properties. Returns *this so it can be chained inside the
/// constructor's member-initializer list (see the AArch64Subtarget ctor).
AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
  // Determine default and user-specified characteristics

  // Fall back to the generic CPU when none was specified.
  if (CPUString.empty())
    CPUString = "generic";

  // By default, tune for the CPU we are compiling for.
  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties();

  return *this;
}
68 
/// Set per-CPU tuning knobs based on the processor family selected by
/// ParseSubtargetFeatures. Notes on the knobs assigned below:
///  * Pref{Function,Loop}LogAlignment are log2 of the preferred alignment.
///  * PrefetchDistance / MinPrefetchStride / MaxPrefetchIterationsAhead feed
///    the loop-data-prefetch heuristics.
///  * VScaleForTuning is the vscale value assumed when tuning for this core.
///  * MinVectorRegisterBitWidth = 128 disables 64-bit SLP vectorization (see
///    the FIXMEs on the individual cases).
void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
  switch (ARMProcFamily) {
  case Others:
    // Unknown/generic CPUs keep the target-wide defaults.
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    break;
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA510:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexX2:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    break;
  case NeoverseN2:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 2;
    break;
  case Neoverse512TVB:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  }
}
220 
/// Construct the subtarget: parse CPU/feature strings, build the backend
/// objects (frame lowering, instruction info, target lowering) and the
/// GlobalISel pipeline components.
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &TuneCPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      FrameLowering(),
      // initializeSubtargetDependencies() parses the feature strings here,
      // inside InstrInfo's initializer, so the subtarget is fully configured
      // before TLInfo (initialized next from *this) is constructed.
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)), TSInfo(),
      TLInfo(TM, *this) {
  // Some platform ABIs give X18 a fixed role; reserve it there by default.
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  // Construct the GlobalISel pipeline objects for this subtarget.
  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  // RegBankInfo takes ownership of RBI.
  RegBankInfo.reset(RBI);
}
253 
/// GlobalISel accessor: the call lowering built in the constructor.
const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}
257 
/// GlobalISel accessor: the inline-asm lowering built in the constructor.
const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}
261 
/// GlobalISel accessor: the instruction selector built in the constructor.
InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}
265 
/// GlobalISel accessor: the legalizer info built in the constructor.
const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}
269 
/// GlobalISel accessor: the register-bank info built in the constructor.
const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}
273 
274 /// Find the target operand flags that describe how a global value should be
275 /// referenced for the current subtarget.
276 unsigned
277 AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
278                                           const TargetMachine &TM) const {
279   // MachO large model always goes via a GOT, simply to get a single 8-byte
280   // absolute relocation on all global addresses.
281   if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
282     return AArch64II::MO_GOT;
283 
284   if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
285     if (GV->hasDLLImportStorageClass())
286       return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
287     if (getTargetTriple().isOSWindows())
288       return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
289     return AArch64II::MO_GOT;
290   }
291 
292   // The small code model's direct accesses use ADRP, which cannot
293   // necessarily produce the value 0 (if the code is above 4GB).
294   // Same for the tiny code model, where we have a pc relative LDR.
295   if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
296       GV->hasExternalWeakLinkage())
297     return AArch64II::MO_GOT;
298 
299   // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
300   // that their nominal addresses are tagged and outside of the code model. In
301   // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
302   // tag if necessary based on MO_TAGGED.
303   if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
304     return AArch64II::MO_NC | AArch64II::MO_TAGGED;
305 
306   return AArch64II::MO_NO_FLAG;
307 }
308 
309 unsigned AArch64Subtarget::classifyGlobalFunctionReference(
310     const GlobalValue *GV, const TargetMachine &TM) const {
311   // MachO large model always goes via a GOT, because we don't have the
312   // relocations available to do anything else..
313   if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
314       !GV->hasInternalLinkage())
315     return AArch64II::MO_GOT;
316 
317   // NonLazyBind goes via GOT unless we know it's available locally.
318   auto *F = dyn_cast<Function>(GV);
319   if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
320       !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
321     return AArch64II::MO_GOT;
322 
323   // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
324   if (getTargetTriple().isOSWindows())
325     return ClassifyGlobalReference(GV, TM);
326 
327   return AArch64II::MO_NO_FLAG;
328 }
329 
/// Tweak the MachineScheduler policy for this subtarget.
void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling. 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or Disabling the latency heuristic is a close call: It seems to
  // help nearly no benchmark on out-of-order architectures, on the other hand
  // it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}
341 
/// Early if-conversion is gated on -aarch64-early-ifcvt (default: enabled).
bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}
345 
346 bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
347   if (!UseAddressTopByteIgnored)
348     return false;
349 
350   if (TargetTriple.isiOS()) {
351     unsigned Major, Minor, Micro;
352     TargetTriple.getiOSVersion(Major, Minor, Micro);
353     return Major >= 8;
354   }
355 
356   return false;
357 }
358 
359 std::unique_ptr<PBQPRAConstraint>
360 AArch64Subtarget::getCustomPBQPConstraints() const {
361   return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
362 }
363 
364 void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
365   // We usually compute max call frame size after ISel. Do the computation now
366   // if the .mir file didn't specify it. Note that this will probably give you
367   // bogus values after PEI has eliminated the callframe setup/destroy pseudo
368   // instructions, specify explicitly if you need it to be correct.
369   MachineFrameInfo &MFI = MF.getFrameInfo();
370   if (!MFI.isMaxCallFrameSizeComputed())
371     MFI.computeMaxCallFrameSize(MF);
372 }
373 
374 bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
375   // Prefer NEON unless larger SVE registers are available.
376   return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
377 }
378 
// Codegen AA usage is controlled by -aarch64-use-aa (default: enabled).
bool AArch64Subtarget::useAA() const { return UseAA; }
380