1 //===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64 specific subclass of TargetSubtarget.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64Subtarget.h"
14 
15 #include "AArch64.h"
16 #include "AArch64InstrInfo.h"
17 #include "AArch64PBQPRegAlloc.h"
18 #include "AArch64TargetMachine.h"
19 #include "GISel/AArch64CallLowering.h"
20 #include "GISel/AArch64LegalizerInfo.h"
21 #include "GISel/AArch64RegisterBankInfo.h"
22 #include "MCTargetDesc/AArch64AddressingModes.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
24 #include "llvm/CodeGen/MachineFrameInfo.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/IR/GlobalValue.h"
27 #include "llvm/Support/AArch64TargetParser.h"
28 #include "llvm/Support/TargetParser.h"
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "aarch64-subtarget"
33 
34 #define GET_SUBTARGETINFO_CTOR
35 #define GET_SUBTARGETINFO_TARGET_DESC
36 #include "AArch64GenSubtargetInfo.inc"
37 
38 static cl::opt<bool>
39 EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
40                      "converter pass"), cl::init(true), cl::Hidden);
41 
42 // If OS supports TBI, use this flag to enable it.
43 static cl::opt<bool>
44 UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
45                          "an address is ignored"), cl::init(false), cl::Hidden);
46 
47 static cl::opt<bool>
48     UseNonLazyBind("aarch64-enable-nonlazybind",
49                    cl::desc("Call nonlazybind functions via direct GOT load"),
50                    cl::init(false), cl::Hidden);
51 
52 static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
53                            cl::desc("Enable the use of AA during codegen."));
54 
55 AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
56     StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
57   // Determine default and user-specified characteristics
58 
59   if (CPUString.empty())
60     CPUString = "generic";
61 
62   if (TuneCPUString.empty())
63     TuneCPUString = CPUString;
64 
65   ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
66   initializeProperties();
67 
68   return *this;
69 }
70 
71 void AArch64Subtarget::initializeProperties() {
72   // Initialize CPU specific properties. We should add a tablegen feature for
73   // this in the future so we can specify it together with the subtarget
74   // features.
75   switch (ARMProcFamily) {
76   case Others:
77     break;
78   case Carmel:
79     CacheLineSize = 64;
80     break;
81   case CortexA35:
82   case CortexA53:
83   case CortexA55:
84     PrefFunctionLogAlignment = 4;
85     PrefLoopLogAlignment = 4;
86     MaxBytesForLoopAlignment = 8;
87     break;
88   case CortexA57:
89     MaxInterleaveFactor = 4;
90     PrefFunctionLogAlignment = 4;
91     PrefLoopLogAlignment = 4;
92     MaxBytesForLoopAlignment = 8;
93     break;
94   case CortexA65:
95     PrefFunctionLogAlignment = 3;
96     break;
97   case CortexA72:
98   case CortexA73:
99   case CortexA75:
100     PrefFunctionLogAlignment = 4;
101     PrefLoopLogAlignment = 4;
102     MaxBytesForLoopAlignment = 8;
103     break;
104   case CortexA76:
105   case CortexA77:
106   case CortexA78:
107   case CortexA78C:
108   case CortexR82:
109   case CortexX1:
110   case CortexX1C:
111     PrefFunctionLogAlignment = 4;
112     PrefLoopLogAlignment = 5;
113     MaxBytesForLoopAlignment = 16;
114     break;
115   case CortexA510:
116     PrefFunctionLogAlignment = 4;
117     VScaleForTuning = 1;
118     PrefLoopLogAlignment = 4;
119     MaxBytesForLoopAlignment = 8;
120     break;
121   case CortexA710:
122   case CortexX2:
123     PrefFunctionLogAlignment = 4;
124     VScaleForTuning = 1;
125     PrefLoopLogAlignment = 5;
126     MaxBytesForLoopAlignment = 16;
127     break;
128   case A64FX:
129     CacheLineSize = 256;
130     PrefFunctionLogAlignment = 3;
131     PrefLoopLogAlignment = 2;
132     MaxInterleaveFactor = 4;
133     PrefetchDistance = 128;
134     MinPrefetchStride = 1024;
135     MaxPrefetchIterationsAhead = 4;
136     VScaleForTuning = 4;
137     break;
138   case AppleA7:
139   case AppleA10:
140   case AppleA11:
141   case AppleA12:
142   case AppleA13:
143   case AppleA14:
144     CacheLineSize = 64;
145     PrefetchDistance = 280;
146     MinPrefetchStride = 2048;
147     MaxPrefetchIterationsAhead = 3;
148     break;
149   case ExynosM3:
150     MaxInterleaveFactor = 4;
151     MaxJumpTableSize = 20;
152     PrefFunctionLogAlignment = 5;
153     PrefLoopLogAlignment = 4;
154     break;
155   case Falkor:
156     MaxInterleaveFactor = 4;
157     // FIXME: remove this to enable 64-bit SLP if performance looks good.
158     MinVectorRegisterBitWidth = 128;
159     CacheLineSize = 128;
160     PrefetchDistance = 820;
161     MinPrefetchStride = 2048;
162     MaxPrefetchIterationsAhead = 8;
163     break;
164   case Kryo:
165     MaxInterleaveFactor = 4;
166     VectorInsertExtractBaseCost = 2;
167     CacheLineSize = 128;
168     PrefetchDistance = 740;
169     MinPrefetchStride = 1024;
170     MaxPrefetchIterationsAhead = 11;
171     // FIXME: remove this to enable 64-bit SLP if performance looks good.
172     MinVectorRegisterBitWidth = 128;
173     break;
174   case NeoverseE1:
175     PrefFunctionLogAlignment = 3;
176     break;
177   case NeoverseN1:
178     PrefFunctionLogAlignment = 4;
179     PrefLoopLogAlignment = 5;
180     MaxBytesForLoopAlignment = 16;
181     break;
182   case NeoverseN2:
183     PrefFunctionLogAlignment = 4;
184     PrefLoopLogAlignment = 5;
185     MaxBytesForLoopAlignment = 16;
186     VScaleForTuning = 1;
187     break;
188   case NeoverseV1:
189     PrefFunctionLogAlignment = 4;
190     PrefLoopLogAlignment = 5;
191     MaxBytesForLoopAlignment = 16;
192     VScaleForTuning = 2;
193     break;
194   case Neoverse512TVB:
195     PrefFunctionLogAlignment = 4;
196     VScaleForTuning = 1;
197     MaxInterleaveFactor = 4;
198     break;
199   case Saphira:
200     MaxInterleaveFactor = 4;
201     // FIXME: remove this to enable 64-bit SLP if performance looks good.
202     MinVectorRegisterBitWidth = 128;
203     break;
204   case ThunderX2T99:
205     CacheLineSize = 64;
206     PrefFunctionLogAlignment = 3;
207     PrefLoopLogAlignment = 2;
208     MaxInterleaveFactor = 4;
209     PrefetchDistance = 128;
210     MinPrefetchStride = 1024;
211     MaxPrefetchIterationsAhead = 4;
212     // FIXME: remove this to enable 64-bit SLP if performance looks good.
213     MinVectorRegisterBitWidth = 128;
214     break;
215   case ThunderX:
216   case ThunderXT88:
217   case ThunderXT81:
218   case ThunderXT83:
219     CacheLineSize = 128;
220     PrefFunctionLogAlignment = 3;
221     PrefLoopLogAlignment = 2;
222     // FIXME: remove this to enable 64-bit SLP if performance looks good.
223     MinVectorRegisterBitWidth = 128;
224     break;
225   case TSV110:
226     CacheLineSize = 64;
227     PrefFunctionLogAlignment = 4;
228     PrefLoopLogAlignment = 2;
229     break;
230   case ThunderX3T110:
231     CacheLineSize = 64;
232     PrefFunctionLogAlignment = 4;
233     PrefLoopLogAlignment = 2;
234     MaxInterleaveFactor = 4;
235     PrefetchDistance = 128;
236     MinPrefetchStride = 1024;
237     MaxPrefetchIterationsAhead = 4;
238     // FIXME: remove this to enable 64-bit SLP if performance looks good.
239     MinVectorRegisterBitWidth = 128;
240     break;
241   }
242 }
243 
244 AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
245                                    const std::string &TuneCPU,
246                                    const std::string &FS,
247                                    const TargetMachine &TM, bool LittleEndian,
248                                    unsigned MinSVEVectorSizeInBitsOverride,
249                                    unsigned MaxSVEVectorSizeInBitsOverride)
250     : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
251       ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
252       CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
253       IsLittle(LittleEndian),
254       MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
255       MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
256       InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
257       TLInfo(TM, *this) {
258   if (AArch64::isX18ReservedByDefault(TT))
259     ReserveXRegister.set(18);
260 
261   CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
262   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
263   Legalizer.reset(new AArch64LegalizerInfo(*this));
264 
265   auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());
266 
267   // FIXME: At this point, we can't rely on Subtarget having RBI.
268   // It's awkward to mix passing RBI and the Subtarget; should we pass
269   // TII/TRI as well?
270   InstSelector.reset(createAArch64InstructionSelector(
271       *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));
272 
273   RegBankInfo.reset(RBI);
274 }
275 
276 const CallLowering *AArch64Subtarget::getCallLowering() const {
277   return CallLoweringInfo.get();
278 }
279 
280 const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
281   return InlineAsmLoweringInfo.get();
282 }
283 
284 InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
285   return InstSelector.get();
286 }
287 
288 const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
289   return Legalizer.get();
290 }
291 
292 const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
293   return RegBankInfo.get();
294 }
295 
296 /// Find the target operand flags that describe how a global value should be
297 /// referenced for the current subtarget.
298 unsigned
299 AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
300                                           const TargetMachine &TM) const {
301   // MachO large model always goes via a GOT, simply to get a single 8-byte
302   // absolute relocation on all global addresses.
303   if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
304     return AArch64II::MO_GOT;
305 
306   if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
307     if (GV->hasDLLImportStorageClass())
308       return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
309     if (getTargetTriple().isOSWindows())
310       return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
311     return AArch64II::MO_GOT;
312   }
313 
314   // The small code model's direct accesses use ADRP, which cannot
315   // necessarily produce the value 0 (if the code is above 4GB).
316   // Same for the tiny code model, where we have a pc relative LDR.
317   if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
318       GV->hasExternalWeakLinkage())
319     return AArch64II::MO_GOT;
320 
321   // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
322   // that their nominal addresses are tagged and outside of the code model. In
323   // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
324   // tag if necessary based on MO_TAGGED.
325   if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
326     return AArch64II::MO_NC | AArch64II::MO_TAGGED;
327 
328   return AArch64II::MO_NO_FLAG;
329 }
330 
331 unsigned AArch64Subtarget::classifyGlobalFunctionReference(
332     const GlobalValue *GV, const TargetMachine &TM) const {
333   // MachO large model always goes via a GOT, because we don't have the
334   // relocations available to do anything else..
335   if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
336       !GV->hasInternalLinkage())
337     return AArch64II::MO_GOT;
338 
339   // NonLazyBind goes via GOT unless we know it's available locally.
340   auto *F = dyn_cast<Function>(GV);
341   if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
342       !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
343     return AArch64II::MO_GOT;
344 
345   // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
346   if (getTargetTriple().isOSWindows())
347     return ClassifyGlobalReference(GV, TM);
348 
349   return AArch64II::MO_NO_FLAG;
350 }
351 
352 void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
353                                            unsigned NumRegionInstrs) const {
354   // LNT run (at least on Cyclone) showed reasonably significant gains for
355   // bi-directional scheduling. 253.perlbmk.
356   Policy.OnlyTopDown = false;
357   Policy.OnlyBottomUp = false;
358   // Enabling or Disabling the latency heuristic is a close call: It seems to
359   // help nearly no benchmark on out-of-order architectures, on the other hand
360   // it regresses register pressure on a few benchmarking.
361   Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
362 }
363 
364 bool AArch64Subtarget::enableEarlyIfConversion() const {
365   return EnableEarlyIfConvert;
366 }
367 
368 bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
369   if (!UseAddressTopByteIgnored)
370     return false;
371 
372   if (TargetTriple.isDriverKit())
373     return true;
374   if (TargetTriple.isiOS()) {
375     return TargetTriple.getiOSVersion() >= VersionTuple(8);
376   }
377 
378   return false;
379 }
380 
381 std::unique_ptr<PBQPRAConstraint>
382 AArch64Subtarget::getCustomPBQPConstraints() const {
383   return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
384 }
385 
386 void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
387   // We usually compute max call frame size after ISel. Do the computation now
388   // if the .mir file didn't specify it. Note that this will probably give you
389   // bogus values after PEI has eliminated the callframe setup/destroy pseudo
390   // instructions, specify explicitly if you need it to be correct.
391   MachineFrameInfo &MFI = MF.getFrameInfo();
392   if (!MFI.isMaxCallFrameSizeComputed())
393     MFI.computeMaxCallFrameSize(MF);
394 }
395 
396 bool AArch64Subtarget::useAA() const { return UseAA; }
397