//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/AArch64TargetParser.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"
static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

static cl::opt<unsigned> OverrideVectorInsertExtractBaseCost(
    "aarch64-insert-extract-base-cost",
    cl::desc("Base cost of vector insert/extract element"), cl::Hidden);
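// Return the base cost of a vector insert/extract element, preferring the
// value given on the command line (-aarch64-insert-extract-base-cost) when
// one was provided.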
unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
  if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0)
    return OverrideVectorInsertExtractBaseCost;
  return VectorInsertExtractBaseCost;
}

AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
  // Determine default and user-specified characteristics

  if (CPUString.empty())
    CPUString = "generic";

  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties();

  return *this;
}

void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
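  // Note that the *LogAlignment values below are log2 of a byte alignment,
  // e.g. PrefFunctionLogAlignment = 4 requests 16-byte function alignment.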
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 4;
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 4;
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 4;
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
  case CortexX1C:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case CortexA510:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    PrefLoopLogAlignment = 4;
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA710:
  case CortexX2:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    break;
  case Neoverse512TVB:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case Ampere1:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 6;
    PrefLoopLogAlignment = 6;
    MaxInterleaveFactor = 4;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &TuneCPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
      TLInfo(TM, *this) {
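  // X18 is the platform register on several ABIs (e.g. Darwin and Windows);
  // reserve it by default there so the register allocator never touches it.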
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}

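// Accessors for the GlobalISel helper objects constructed above; the
// subtarget owns them and hands out non-owning pointers.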
const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc-relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // An LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling (e.g. on 253.perlbmk).
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help nearly no benchmark on out-of-order architectures, while on the
  // other hand it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

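// Top Byte Ignore (TBI) makes the AArch64 hardware ignore the top eight bits
// of a virtual address on loads and stores. Only report support when the
// -aarch64-use-tbi flag is set and the target OS is known to preserve that
// behaviour.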
bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isDriverKit())
    return true;
  if (TargetTriple.isiOS())
    return TargetTriple.getiOSVersion() >= VersionTuple(8);

  return false;
}

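// When the subtarget wants balanced FP ops (e.g. Cortex-A57), install the A57
// chaining constraint for the PBQP register allocator; otherwise no custom
// constraint is needed.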
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

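// Whether codegen (e.g. the machine scheduler) should query alias analysis;
// controlled by -aarch64-use-aa and on by default.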
bool AArch64Subtarget::useAA() const { return UseAA; }