1 //===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64 specific subclass of TargetSubtarget.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64Subtarget.h"
14 
15 #include "AArch64.h"
16 #include "AArch64InstrInfo.h"
17 #include "AArch64PBQPRegAlloc.h"
18 #include "AArch64TargetMachine.h"
19 #include "GISel/AArch64CallLowering.h"
20 #include "GISel/AArch64LegalizerInfo.h"
21 #include "GISel/AArch64RegisterBankInfo.h"
22 #include "MCTargetDesc/AArch64AddressingModes.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
24 #include "llvm/CodeGen/MachineFrameInfo.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/IR/GlobalValue.h"
27 #include "llvm/Support/AArch64TargetParser.h"
28 #include "llvm/Support/TargetParser.h"
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "aarch64-subtarget"
33 
34 #define GET_SUBTARGETINFO_CTOR
35 #define GET_SUBTARGETINFO_TARGET_DESC
36 #include "AArch64GenSubtargetInfo.inc"
37 
38 static cl::opt<bool>
39 EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
40                      "converter pass"), cl::init(true), cl::Hidden);
41 
42 // If OS supports TBI, use this flag to enable it.
43 static cl::opt<bool>
44 UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
45                          "an address is ignored"), cl::init(false), cl::Hidden);
46 
47 static cl::opt<bool>
48     UseNonLazyBind("aarch64-enable-nonlazybind",
49                    cl::desc("Call nonlazybind functions via direct GOT load"),
50                    cl::init(false), cl::Hidden);
51 
52 static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
53                            cl::desc("Enable the use of AA during codegen."));
54 
55 AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
56     StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
57   // Determine default and user-specified characteristics
58 
59   if (CPUString.empty())
60     CPUString = "generic";
61 
62   if (TuneCPUString.empty())
63     TuneCPUString = CPUString;
64 
65   ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
66   initializeProperties();
67 
68   return *this;
69 }
70 
/// Apply per-CPU overrides to the subtarget's tuning fields.
///
/// Switches on ARMProcFamily and assigns family-specific values to members
/// such as CacheLineSize, PrefFunctionLogAlignment, PrefLoopLogAlignment,
/// MaxInterleaveFactor, VScaleForTuning and the prefetch parameters. Any field
/// a case does not assign keeps the value it had on entry. Called from
/// initializeSubtargetDependencies() right after ParseSubtargetFeatures().
void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
  switch (ARMProcFamily) {
  case Others:
    // No family-specific overrides.
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    // No family-specific overrides.
    break;
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  // These cores share a single override: the preferred function alignment.
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
  case CortexX1C:
    PrefFunctionLogAlignment = 4;
    break;
  // Same function alignment as the group above, plus a VScaleForTuning value.
  case CortexA510:
  case CortexA710:
  case CortexX2:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  // All listed Apple families share the same cache-line and prefetch settings.
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  // NeoverseN1, NeoverseN2 and NeoverseV1 use the same function/loop alignment
  // settings; N2 and V1 additionally set VScaleForTuning.
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    break;
  case Neoverse512TVB:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  // All ThunderX variants listed here share one set of overrides.
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  }
}
227 
228 AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
229                                    const std::string &TuneCPU,
230                                    const std::string &FS,
231                                    const TargetMachine &TM, bool LittleEndian,
232                                    unsigned MinSVEVectorSizeInBitsOverride,
233                                    unsigned MaxSVEVectorSizeInBitsOverride)
234     : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
235       ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
236       CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
237       IsLittle(LittleEndian),
238       MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
239       MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
240       InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
241       TLInfo(TM, *this) {
242   if (AArch64::isX18ReservedByDefault(TT))
243     ReserveXRegister.set(18);
244 
245   CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
246   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
247   Legalizer.reset(new AArch64LegalizerInfo(*this));
248 
249   auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());
250 
251   // FIXME: At this point, we can't rely on Subtarget having RBI.
252   // It's awkward to mix passing RBI and the Subtarget; should we pass
253   // TII/TRI as well?
254   InstSelector.reset(createAArch64InstructionSelector(
255       *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));
256 
257   RegBankInfo.reset(RBI);
258 }
259 
260 const CallLowering *AArch64Subtarget::getCallLowering() const {
261   return CallLoweringInfo.get();
262 }
263 
264 const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
265   return InlineAsmLoweringInfo.get();
266 }
267 
268 InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
269   return InstSelector.get();
270 }
271 
272 const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
273   return Legalizer.get();
274 }
275 
276 const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
277   return RegBankInfo.get();
278 }
279 
280 /// Find the target operand flags that describe how a global value should be
281 /// referenced for the current subtarget.
282 unsigned
283 AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
284                                           const TargetMachine &TM) const {
285   // MachO large model always goes via a GOT, simply to get a single 8-byte
286   // absolute relocation on all global addresses.
287   if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
288     return AArch64II::MO_GOT;
289 
290   if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
291     if (GV->hasDLLImportStorageClass())
292       return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
293     if (getTargetTriple().isOSWindows())
294       return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
295     return AArch64II::MO_GOT;
296   }
297 
298   // The small code model's direct accesses use ADRP, which cannot
299   // necessarily produce the value 0 (if the code is above 4GB).
300   // Same for the tiny code model, where we have a pc relative LDR.
301   if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
302       GV->hasExternalWeakLinkage())
303     return AArch64II::MO_GOT;
304 
305   // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
306   // that their nominal addresses are tagged and outside of the code model. In
307   // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
308   // tag if necessary based on MO_TAGGED.
309   if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
310     return AArch64II::MO_NC | AArch64II::MO_TAGGED;
311 
312   return AArch64II::MO_NO_FLAG;
313 }
314 
315 unsigned AArch64Subtarget::classifyGlobalFunctionReference(
316     const GlobalValue *GV, const TargetMachine &TM) const {
317   // MachO large model always goes via a GOT, because we don't have the
318   // relocations available to do anything else..
319   if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
320       !GV->hasInternalLinkage())
321     return AArch64II::MO_GOT;
322 
323   // NonLazyBind goes via GOT unless we know it's available locally.
324   auto *F = dyn_cast<Function>(GV);
325   if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
326       !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
327     return AArch64II::MO_GOT;
328 
329   // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
330   if (getTargetTriple().isOSWindows())
331     return ClassifyGlobalReference(GV, TM);
332 
333   return AArch64II::MO_NO_FLAG;
334 }
335 
336 void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
337                                            unsigned NumRegionInstrs) const {
338   // LNT run (at least on Cyclone) showed reasonably significant gains for
339   // bi-directional scheduling. 253.perlbmk.
340   Policy.OnlyTopDown = false;
341   Policy.OnlyBottomUp = false;
342   // Enabling or Disabling the latency heuristic is a close call: It seems to
343   // help nearly no benchmark on out-of-order architectures, on the other hand
344   // it regresses register pressure on a few benchmarking.
345   Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
346 }
347 
348 bool AArch64Subtarget::enableEarlyIfConversion() const {
349   return EnableEarlyIfConvert;
350 }
351 
352 bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
353   if (!UseAddressTopByteIgnored)
354     return false;
355 
356   if (TargetTriple.isDriverKit())
357     return true;
358   if (TargetTriple.isiOS()) {
359     return TargetTriple.getiOSVersion() >= VersionTuple(8);
360   }
361 
362   return false;
363 }
364 
365 std::unique_ptr<PBQPRAConstraint>
366 AArch64Subtarget::getCustomPBQPConstraints() const {
367   return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
368 }
369 
370 void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
371   // We usually compute max call frame size after ISel. Do the computation now
372   // if the .mir file didn't specify it. Note that this will probably give you
373   // bogus values after PEI has eliminated the callframe setup/destroy pseudo
374   // instructions, specify explicitly if you need it to be correct.
375   MachineFrameInfo &MFI = MF.getFrameInfo();
376   if (!MFI.isMaxCallFrameSizeComputed())
377     MFI.computeMaxCallFrameSize(MF);
378 }
379 
380 bool AArch64Subtarget::useAA() const { return UseAA; }
381