1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// The AMDGPU target machine contains all of the hardware specific 11 /// information needed to emit code for SI+ GPUs. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "AMDGPUTargetMachine.h" 16 #include "AMDGPU.h" 17 #include "AMDGPUAliasAnalysis.h" 18 #include "AMDGPUExportClustering.h" 19 #include "AMDGPUMFMAIGroupLP.h" 20 #include "AMDGPUMacroFusion.h" 21 #include "AMDGPUTargetObjectFile.h" 22 #include "AMDGPUTargetTransformInfo.h" 23 #include "GCNIterativeScheduler.h" 24 #include "GCNSchedStrategy.h" 25 #include "R600.h" 26 #include "R600TargetMachine.h" 27 #include "SIMachineFunctionInfo.h" 28 #include "SIMachineScheduler.h" 29 #include "TargetInfo/AMDGPUTargetInfo.h" 30 #include "llvm/Analysis/CGSCCPassManager.h" 31 #include "llvm/CodeGen/GlobalISel/CSEInfo.h" 32 #include "llvm/CodeGen/GlobalISel/IRTranslator.h" 33 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" 34 #include "llvm/CodeGen/GlobalISel/Legalizer.h" 35 #include "llvm/CodeGen/GlobalISel/Localizer.h" 36 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" 37 #include "llvm/CodeGen/MIRParser/MIParser.h" 38 #include "llvm/CodeGen/Passes.h" 39 #include "llvm/CodeGen/RegAllocRegistry.h" 40 #include "llvm/CodeGen/TargetPassConfig.h" 41 #include "llvm/IR/IntrinsicsAMDGPU.h" 42 #include "llvm/IR/LegacyPassManager.h" 43 #include "llvm/IR/PassManager.h" 44 #include "llvm/IR/PatternMatch.h" 45 #include "llvm/InitializePasses.h" 46 #include "llvm/MC/TargetRegistry.h" 47 #include "llvm/Passes/PassBuilder.h" 48 #include "llvm/Transforms/IPO.h" 49 #include "llvm/Transforms/IPO/AlwaysInliner.h" 50 #include "llvm/Transforms/IPO/GlobalDCE.h" 51 #include "llvm/Transforms/IPO/Internalize.h" 52 #include "llvm/Transforms/IPO/PassManagerBuilder.h" 53 #include "llvm/Transforms/Scalar.h" 54 #include "llvm/Transforms/Scalar/GVN.h" 55 #include "llvm/Transforms/Scalar/InferAddressSpaces.h" 56 #include "llvm/Transforms/Utils.h" 57 #include "llvm/Transforms/Utils/SimplifyLibCalls.h" 58 #include "llvm/Transforms/Vectorize.h" 59 60 using namespace llvm; 61 using namespace llvm::PatternMatch; 62 63 namespace { 64 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> { 65 public: 66 SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) 67 : RegisterRegAllocBase(N, D, C) {} 68 }; 69 70 class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> { 71 public: 72 VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) 73 : RegisterRegAllocBase(N, D, C) {} 74 }; 75 76 static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI, 77 const TargetRegisterClass &RC) { 78 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC); 79 } 80 81 static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI, 82 const TargetRegisterClass &RC) { 83 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC); 84 } 85 86 87 /// -{sgpr|vgpr}-regalloc=... command line option. 88 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } 89 90 /// A dummy default pass factory indicates whether the register allocator is 91 /// overridden on the command line. 92 static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag; 93 static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag; 94 95 static SGPRRegisterRegAlloc 96 defaultSGPRRegAlloc("default", 97 "pick SGPR register allocator based on -O option", 98 useDefaultRegisterAllocator); 99 100 static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false, 101 RegisterPassParser<SGPRRegisterRegAlloc>> 102 SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), 103 cl::desc("Register allocator to use for SGPRs")); 104 105 static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false, 106 RegisterPassParser<VGPRRegisterRegAlloc>> 107 VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), 108 cl::desc("Register allocator to use for VGPRs")); 109 110 111 static void initializeDefaultSGPRRegisterAllocatorOnce() { 112 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); 113 114 if (!Ctor) { 115 Ctor = SGPRRegAlloc; 116 SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc); 117 } 118 } 119 120 static void initializeDefaultVGPRRegisterAllocatorOnce() { 121 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); 122 123 if (!Ctor) { 124 Ctor = VGPRRegAlloc; 125 VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc); 126 } 127 } 128 129 static FunctionPass *createBasicSGPRRegisterAllocator() { 130 return createBasicRegisterAllocator(onlyAllocateSGPRs); 131 } 132 133 static FunctionPass *createGreedySGPRRegisterAllocator() { 134 return createGreedyRegisterAllocator(onlyAllocateSGPRs); 135 } 136 137 static FunctionPass *createFastSGPRRegisterAllocator() { 138 return createFastRegisterAllocator(onlyAllocateSGPRs, false); 139 } 140 141 static FunctionPass *createBasicVGPRRegisterAllocator() { 142 return createBasicRegisterAllocator(onlyAllocateVGPRs); 143 } 144 145 static FunctionPass *createGreedyVGPRRegisterAllocator() { 146 return createGreedyRegisterAllocator(onlyAllocateVGPRs); 147 } 148 149 static FunctionPass *createFastVGPRRegisterAllocator() { 150 return createFastRegisterAllocator(onlyAllocateVGPRs, true); 151 } 152 153 static SGPRRegisterRegAlloc basicRegAllocSGPR( 154 "basic", "basic register allocator", createBasicSGPRRegisterAllocator); 155 static SGPRRegisterRegAlloc greedyRegAllocSGPR( 156 "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator); 157 158 static SGPRRegisterRegAlloc fastRegAllocSGPR( 159 "fast", "fast register allocator", createFastSGPRRegisterAllocator); 160 161 162 static VGPRRegisterRegAlloc basicRegAllocVGPR( 163 "basic", "basic register allocator", createBasicVGPRRegisterAllocator); 164 static VGPRRegisterRegAlloc greedyRegAllocVGPR( 165 "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator); 166 167 static VGPRRegisterRegAlloc fastRegAllocVGPR( 168 "fast", "fast register allocator", createFastVGPRRegisterAllocator); 169 } 170 171 static cl::opt<bool> EnableSROA( 172 "amdgpu-sroa", 173 cl::desc("Run SROA after promote alloca pass"), 174 cl::ReallyHidden, 175 cl::init(true)); 176 177 static cl::opt<bool> 178 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, 179 cl::desc("Run early if-conversion"), 180 cl::init(false)); 181 182 static cl::opt<bool> 183 OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, 184 cl::desc("Run pre-RA exec mask optimizations"), 185 cl::init(true)); 186 187 // Option to disable vectorizer for tests. 188 static cl::opt<bool> EnableLoadStoreVectorizer( 189 "amdgpu-load-store-vectorizer", 190 cl::desc("Enable load store vectorizer"), 191 cl::init(true), 192 cl::Hidden); 193 194 // Option to control global loads scalarization 195 static cl::opt<bool> ScalarizeGlobal( 196 "amdgpu-scalarize-global-loads", 197 cl::desc("Enable global load scalarization"), 198 cl::init(true), 199 cl::Hidden); 200 201 // Option to run internalize pass. 202 static cl::opt<bool> InternalizeSymbols( 203 "amdgpu-internalize-symbols", 204 cl::desc("Enable elimination of non-kernel functions and unused globals"), 205 cl::init(false), 206 cl::Hidden); 207 208 // Option to inline all early. 209 static cl::opt<bool> EarlyInlineAll( 210 "amdgpu-early-inline-all", 211 cl::desc("Inline all functions early"), 212 cl::init(false), 213 cl::Hidden); 214 215 static cl::opt<bool> EnableSDWAPeephole( 216 "amdgpu-sdwa-peephole", 217 cl::desc("Enable SDWA peepholer"), 218 cl::init(true)); 219 220 static cl::opt<bool> EnableDPPCombine( 221 "amdgpu-dpp-combine", 222 cl::desc("Enable DPP combiner"), 223 cl::init(true)); 224 225 // Enable address space based alias analysis 226 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, 227 cl::desc("Enable AMDGPU Alias Analysis"), 228 cl::init(true)); 229 230 // Option to run late CFG structurizer 231 static cl::opt<bool, true> LateCFGStructurize( 232 "amdgpu-late-structurize", 233 cl::desc("Enable late CFG structurization"), 234 cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), 235 cl::Hidden); 236 237 // Enable lib calls simplifications 238 static cl::opt<bool> EnableLibCallSimplify( 239 "amdgpu-simplify-libcall", 240 cl::desc("Enable amdgpu library simplifications"), 241 cl::init(true), 242 cl::Hidden); 243 244 static cl::opt<bool> EnableLowerKernelArguments( 245 "amdgpu-ir-lower-kernel-arguments", 246 cl::desc("Lower kernel argument loads in IR pass"), 247 cl::init(true), 248 cl::Hidden); 249 250 static cl::opt<bool> EnableRegReassign( 251 "amdgpu-reassign-regs", 252 cl::desc("Enable register reassign optimizations on gfx10+"), 253 cl::init(true), 254 cl::Hidden); 255 256 static cl::opt<bool> OptVGPRLiveRange( 257 "amdgpu-opt-vgpr-liverange", 258 cl::desc("Enable VGPR liverange optimizations for if-else structure"), 259 cl::init(true), cl::Hidden); 260 261 // Enable atomic optimization 262 static cl::opt<bool> EnableAtomicOptimizations( 263 "amdgpu-atomic-optimizations", 264 cl::desc("Enable atomic optimizations"), 265 cl::init(false), 266 cl::Hidden); 267 268 // Enable Mode register optimization 269 static cl::opt<bool> EnableSIModeRegisterPass( 270 "amdgpu-mode-register", 271 cl::desc("Enable mode register pass"), 272 cl::init(true), 273 cl::Hidden); 274 275 // Option is used in lit tests to prevent deadcoding of patterns inspected. 276 static cl::opt<bool> 277 EnableDCEInRA("amdgpu-dce-in-ra", 278 cl::init(true), cl::Hidden, 279 cl::desc("Enable machine DCE inside regalloc")); 280 281 static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority", 282 cl::desc("Adjust wave priority"), 283 cl::init(false), cl::Hidden); 284 285 static cl::opt<bool> EnableScalarIRPasses( 286 "amdgpu-scalar-ir-passes", 287 cl::desc("Enable scalar IR passes"), 288 cl::init(true), 289 cl::Hidden); 290 291 static cl::opt<bool> EnableStructurizerWorkarounds( 292 "amdgpu-enable-structurizer-workarounds", 293 cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), 294 cl::Hidden); 295 296 static cl::opt<bool> EnableLDSReplaceWithPointer( 297 "amdgpu-enable-lds-replace-with-pointer", 298 cl::desc("Enable LDS replace with pointer pass"), cl::init(false), 299 cl::Hidden); 300 301 static cl::opt<bool, true> EnableLowerModuleLDS( 302 "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), 303 cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), 304 cl::Hidden); 305 306 static cl::opt<bool> EnablePreRAOptimizations( 307 "amdgpu-enable-pre-ra-optimizations", 308 cl::desc("Enable Pre-RA optimizations pass"), cl::init(true), 309 cl::Hidden); 310 311 static cl::opt<bool> EnablePromoteKernelArguments( 312 "amdgpu-enable-promote-kernel-arguments", 313 cl::desc("Enable promotion of flat kernel pointer arguments to global"), 314 cl::Hidden, cl::init(true)); 315 316 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { 317 // Register the target 318 RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget()); 319 RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget()); 320 321 PassRegistry *PR = PassRegistry::getPassRegistry(); 322 initializeR600ClauseMergePassPass(*PR); 323 initializeR600ControlFlowFinalizerPass(*PR); 324 initializeR600PacketizerPass(*PR); 325 initializeR600ExpandSpecialInstrsPassPass(*PR); 326 initializeR600VectorRegMergerPass(*PR); 327 initializeGlobalISel(*PR); 328 initializeAMDGPUDAGToDAGISelPass(*PR); 329 initializeGCNDPPCombinePass(*PR); 330 initializeSILowerI1CopiesPass(*PR); 331 initializeSILowerSGPRSpillsPass(*PR); 332 initializeSIFixSGPRCopiesPass(*PR); 333 initializeSIFixVGPRCopiesPass(*PR); 334 initializeSIFoldOperandsPass(*PR); 335 initializeSIPeepholeSDWAPass(*PR); 336 initializeSIShrinkInstructionsPass(*PR); 337 initializeSIOptimizeExecMaskingPreRAPass(*PR); 338 initializeSIOptimizeVGPRLiveRangePass(*PR); 339 initializeSILoadStoreOptimizerPass(*PR); 340 initializeAMDGPUCtorDtorLoweringPass(*PR); 341 initializeAMDGPUAlwaysInlinePass(*PR); 342 initializeAMDGPUAttributorPass(*PR); 343 initializeAMDGPUAnnotateKernelFeaturesPass(*PR); 344 initializeAMDGPUAnnotateUniformValuesPass(*PR); 345 initializeAMDGPUArgumentUsageInfoPass(*PR); 346 initializeAMDGPUAtomicOptimizerPass(*PR); 347 initializeAMDGPULowerKernelArgumentsPass(*PR); 348 initializeAMDGPUPromoteKernelArgumentsPass(*PR); 349 initializeAMDGPULowerKernelAttributesPass(*PR); 350 initializeAMDGPULowerIntrinsicsPass(*PR); 351 initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); 352 initializeAMDGPUPostLegalizerCombinerPass(*PR); 353 initializeAMDGPUPreLegalizerCombinerPass(*PR); 354 initializeAMDGPURegBankCombinerPass(*PR); 355 initializeAMDGPUPromoteAllocaPass(*PR); 356 initializeAMDGPUPromoteAllocaToVectorPass(*PR); 357 initializeAMDGPUCodeGenPreparePass(*PR); 358 initializeAMDGPULateCodeGenPreparePass(*PR); 359 initializeAMDGPUPropagateAttributesEarlyPass(*PR); 360 initializeAMDGPUPropagateAttributesLatePass(*PR); 361 initializeAMDGPUReplaceLDSUseWithPointerPass(*PR); 362 initializeAMDGPULowerModuleLDSPass(*PR); 363 initializeAMDGPURewriteOutArgumentsPass(*PR); 364 initializeAMDGPUUnifyMetadataPass(*PR); 365 initializeSIAnnotateControlFlowPass(*PR); 366 initializeSIInsertHardClausesPass(*PR); 367 initializeSIInsertWaitcntsPass(*PR); 368 initializeSIModeRegisterPass(*PR); 369 initializeSIWholeQuadModePass(*PR); 370 initializeSILowerControlFlowPass(*PR); 371 initializeSIPreEmitPeepholePass(*PR); 372 initializeSILateBranchLoweringPass(*PR); 373 initializeSIMemoryLegalizerPass(*PR); 374 initializeSIOptimizeExecMaskingPass(*PR); 375 initializeSIPreAllocateWWMRegsPass(*PR); 376 initializeSIFormMemoryClausesPass(*PR); 377 initializeSIPostRABundlerPass(*PR); 378 initializeAMDGPUUnifyDivergentExitNodesPass(*PR); 379 initializeAMDGPUAAWrapperPassPass(*PR); 380 initializeAMDGPUExternalAAWrapperPass(*PR); 381 initializeAMDGPUUseNativeCallsPass(*PR); 382 initializeAMDGPUSimplifyLibCallsPass(*PR); 383 initializeAMDGPUPrintfRuntimeBindingPass(*PR); 384 initializeAMDGPUResourceUsageAnalysisPass(*PR); 385 initializeGCNNSAReassignPass(*PR); 386 initializeGCNPreRAOptimizationsPass(*PR); 387 } 388 389 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { 390 return std::make_unique<AMDGPUTargetObjectFile>(); 391 } 392 393 static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { 394 return new SIScheduleDAGMI(C); 395 } 396 397 static ScheduleDAGInstrs * 398 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { 399 ScheduleDAGMILive *DAG = 400 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C)); 401 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); 402 DAG->addMutation(createMFMAIGroupLPDAGMutation()); 403 DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); 404 DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); 405 return DAG; 406 } 407 408 static ScheduleDAGInstrs * 409 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { 410 auto DAG = new GCNIterativeScheduler(C, 411 GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY); 412 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); 413 return DAG; 414 } 415 416 static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { 417 return new GCNIterativeScheduler(C, 418 GCNIterativeScheduler::SCHEDULE_MINREGFORCED); 419 } 420 421 static ScheduleDAGInstrs * 422 createIterativeILPMachineScheduler(MachineSchedContext *C) { 423 auto DAG = new GCNIterativeScheduler(C, 424 GCNIterativeScheduler::SCHEDULE_ILP); 425 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); 426 DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); 427 return DAG; 428 } 429 430 static MachineSchedRegistry 431 SISchedRegistry("si", "Run SI's custom scheduler", 432 createSIMachineScheduler); 433 434 static MachineSchedRegistry 435 GCNMaxOccupancySchedRegistry("gcn-max-occupancy", 436 "Run GCN scheduler to maximize occupancy", 437 createGCNMaxOccupancyMachineScheduler); 438 439 static MachineSchedRegistry 440 IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental", 441 "Run GCN scheduler to maximize occupancy (experimental)", 442 createIterativeGCNMaxOccupancyMachineScheduler); 443 444 static MachineSchedRegistry 445 GCNMinRegSchedRegistry("gcn-minreg", 446 "Run GCN iterative scheduler for minimal register usage (experimental)", 447 createMinRegScheduler); 448 449 static MachineSchedRegistry 450 GCNILPSchedRegistry("gcn-ilp", 451 "Run GCN iterative scheduler for ILP scheduling (experimental)", 452 createIterativeILPMachineScheduler); 453 454 static StringRef computeDataLayout(const Triple &TT) { 455 if (TT.getArch() == Triple::r600) { 456 // 32-bit pointers. 457 return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" 458 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"; 459 } 460 461 // 32-bit private, local, and region pointers. 64-bit global, constant and 462 // flat, non-integral buffer fat pointers. 463 return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" 464 "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" 465 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1" 466 "-ni:7"; 467 } 468 469 LLVM_READNONE 470 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { 471 if (!GPU.empty()) 472 return GPU; 473 474 // Need to default to a target with flat support for HSA. 475 if (TT.getArch() == Triple::amdgcn) 476 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic"; 477 478 return "r600"; 479 } 480 481 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { 482 // The AMDGPU toolchain only supports generating shared objects, so we 483 // must always use PIC. 484 return Reloc::PIC_; 485 } 486 487 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, 488 StringRef CPU, StringRef FS, 489 TargetOptions Options, 490 Optional<Reloc::Model> RM, 491 Optional<CodeModel::Model> CM, 492 CodeGenOpt::Level OptLevel) 493 : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), 494 FS, Options, getEffectiveRelocModel(RM), 495 getEffectiveCodeModel(CM, CodeModel::Small), OptLevel), 496 TLOF(createTLOF(getTargetTriple())) { 497 initAsmInfo(); 498 if (TT.getArch() == Triple::amdgcn) { 499 if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64")) 500 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64)); 501 else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32")) 502 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32)); 503 } 504 } 505 506 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; 507 bool AMDGPUTargetMachine::EnableFunctionCalls = false; 508 bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; 509 510 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; 511 512 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { 513 Attribute GPUAttr = F.getFnAttribute("target-cpu"); 514 return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU(); 515 } 516 517 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { 518 Attribute FSAttr = F.getFnAttribute("target-features"); 519 520 return FSAttr.isValid() ? FSAttr.getValueAsString() 521 : getTargetFeatureString(); 522 } 523 524 /// Predicate for Internalize pass. 525 static bool mustPreserveGV(const GlobalValue &GV) { 526 if (const Function *F = dyn_cast<Function>(&GV)) 527 return F->isDeclaration() || F->getName().startswith("__asan_") || 528 F->getName().startswith("__sanitizer_") || 529 AMDGPU::isEntryFunctionCC(F->getCallingConv()); 530 531 GV.removeDeadConstantUsers(); 532 return !GV.use_empty(); 533 } 534 535 void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { 536 Builder.DivergentTarget = true; 537 538 bool EnableOpt = getOptLevel() > CodeGenOpt::None; 539 bool Internalize = InternalizeSymbols; 540 bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls; 541 bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt; 542 bool LibCallSimplify = EnableLibCallSimplify && EnableOpt; 543 bool PromoteKernelArguments = 544 EnablePromoteKernelArguments && getOptLevel() > CodeGenOpt::Less; 545 546 if (EnableFunctionCalls) { 547 delete Builder.Inliner; 548 Builder.Inliner = createFunctionInliningPass(); 549 } 550 551 Builder.addExtension( 552 PassManagerBuilder::EP_ModuleOptimizerEarly, 553 [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &, 554 legacy::PassManagerBase &PM) { 555 if (AMDGPUAA) { 556 PM.add(createAMDGPUAAWrapperPass()); 557 PM.add(createAMDGPUExternalAAWrapperPass()); 558 } 559 PM.add(createAMDGPUUnifyMetadataPass()); 560 PM.add(createAMDGPUPrintfRuntimeBinding()); 561 if (Internalize) 562 PM.add(createInternalizePass(mustPreserveGV)); 563 PM.add(createAMDGPUPropagateAttributesLatePass(this)); 564 if (Internalize) 565 PM.add(createGlobalDCEPass()); 566 if (EarlyInline) 567 PM.add(createAMDGPUAlwaysInlinePass(false)); 568 }); 569 570 Builder.addExtension( 571 PassManagerBuilder::EP_EarlyAsPossible, 572 [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &, 573 legacy::PassManagerBase &PM) { 574 if (AMDGPUAA) { 575 PM.add(createAMDGPUAAWrapperPass()); 576 PM.add(createAMDGPUExternalAAWrapperPass()); 577 } 578 PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this)); 579 PM.add(llvm::createAMDGPUUseNativeCallsPass()); 580 if (LibCallSimplify) 581 PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this)); 582 }); 583 584 Builder.addExtension( 585 PassManagerBuilder::EP_CGSCCOptimizerLate, 586 [EnableOpt, PromoteKernelArguments](const PassManagerBuilder &, 587 legacy::PassManagerBase &PM) { 588 // Add promote kernel arguments pass to the opt pipeline right before 589 // infer address spaces which is needed to do actual address space 590 // rewriting. 591 if (PromoteKernelArguments) 592 PM.add(createAMDGPUPromoteKernelArgumentsPass()); 593 594 // Add infer address spaces pass to the opt pipeline after inlining 595 // but before SROA to increase SROA opportunities. 596 PM.add(createInferAddressSpacesPass()); 597 598 // This should run after inlining to have any chance of doing anything, 599 // and before other cleanup optimizations. 600 PM.add(createAMDGPULowerKernelAttributesPass()); 601 602 // Promote alloca to vector before SROA and loop unroll. If we manage 603 // to eliminate allocas before unroll we may choose to unroll less. 604 if (EnableOpt) 605 PM.add(createAMDGPUPromoteAllocaToVector()); 606 }); 607 } 608 609 void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { 610 AAM.registerFunctionAnalysis<AMDGPUAA>(); 611 } 612 613 void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { 614 PB.registerPipelineParsingCallback( 615 [this](StringRef PassName, ModulePassManager &PM, 616 ArrayRef<PassBuilder::PipelineElement>) { 617 if (PassName == "amdgpu-propagate-attributes-late") { 618 PM.addPass(AMDGPUPropagateAttributesLatePass(*this)); 619 return true; 620 } 621 if (PassName == "amdgpu-unify-metadata") { 622 PM.addPass(AMDGPUUnifyMetadataPass()); 623 return true; 624 } 625 if (PassName == "amdgpu-printf-runtime-binding") { 626 PM.addPass(AMDGPUPrintfRuntimeBindingPass()); 627 return true; 628 } 629 if (PassName == "amdgpu-always-inline") { 630 PM.addPass(AMDGPUAlwaysInlinePass()); 631 return true; 632 } 633 if (PassName == "amdgpu-replace-lds-use-with-pointer") { 634 PM.addPass(AMDGPUReplaceLDSUseWithPointerPass()); 635 return true; 636 } 637 if (PassName == "amdgpu-lower-module-lds") { 638 PM.addPass(AMDGPULowerModuleLDSPass()); 639 return true; 640 } 641 return false; 642 }); 643 PB.registerPipelineParsingCallback( 644 [this](StringRef PassName, FunctionPassManager &PM, 645 ArrayRef<PassBuilder::PipelineElement>) { 646 if (PassName == "amdgpu-simplifylib") { 647 PM.addPass(AMDGPUSimplifyLibCallsPass(*this)); 648 return true; 649 } 650 if (PassName == "amdgpu-usenative") { 651 PM.addPass(AMDGPUUseNativeCallsPass()); 652 return true; 653 } 654 if (PassName == "amdgpu-promote-alloca") { 655 PM.addPass(AMDGPUPromoteAllocaPass(*this)); 656 return true; 657 } 658 if (PassName == "amdgpu-promote-alloca-to-vector") { 659 PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this)); 660 return true; 661 } 662 if (PassName == "amdgpu-lower-kernel-attributes") { 663 PM.addPass(AMDGPULowerKernelAttributesPass()); 664 return true; 665 } 666 if (PassName == "amdgpu-propagate-attributes-early") { 667 PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this)); 668 return true; 669 } 670 if (PassName == "amdgpu-promote-kernel-arguments") { 671 PM.addPass(AMDGPUPromoteKernelArgumentsPass()); 672 return true; 673 } 674 return false; 675 }); 676 677 PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) { 678 FAM.registerPass([&] { return AMDGPUAA(); }); 679 }); 680 681 PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) { 682 if (AAName == "amdgpu-aa") { 683 AAM.registerFunctionAnalysis<AMDGPUAA>(); 684 return true; 685 } 686 return false; 687 }); 688 689 PB.registerPipelineStartEPCallback( 690 [this](ModulePassManager &PM, OptimizationLevel Level) { 691 FunctionPassManager FPM; 692 FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this)); 693 FPM.addPass(AMDGPUUseNativeCallsPass()); 694 if (EnableLibCallSimplify && Level != OptimizationLevel::O0) 695 FPM.addPass(AMDGPUSimplifyLibCallsPass(*this)); 696 PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 697 }); 698 699 PB.registerPipelineEarlySimplificationEPCallback( 700 [this](ModulePassManager &PM, OptimizationLevel Level) { 701 if (Level == OptimizationLevel::O0) 702 return; 703 704 PM.addPass(AMDGPUUnifyMetadataPass()); 705 PM.addPass(AMDGPUPrintfRuntimeBindingPass()); 706 707 if (InternalizeSymbols) { 708 PM.addPass(InternalizePass(mustPreserveGV)); 709 } 710 PM.addPass(AMDGPUPropagateAttributesLatePass(*this)); 711 if (InternalizeSymbols) { 712 PM.addPass(GlobalDCEPass()); 713 } 714 if (EarlyInlineAll && !EnableFunctionCalls) 715 PM.addPass(AMDGPUAlwaysInlinePass()); 716 }); 717 718 PB.registerCGSCCOptimizerLateEPCallback( 719 [this](CGSCCPassManager &PM, OptimizationLevel Level) { 720 if (Level == OptimizationLevel::O0) 721 return; 722 723 FunctionPassManager FPM; 724 725 // Add promote kernel arguments pass to the opt pipeline right before 726 // infer address spaces which is needed to do actual address space 727 // rewriting. 728 if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() && 729 EnablePromoteKernelArguments) 730 FPM.addPass(AMDGPUPromoteKernelArgumentsPass()); 731 732 // Add infer address spaces pass to the opt pipeline after inlining 733 // but before SROA to increase SROA opportunities. 734 FPM.addPass(InferAddressSpacesPass()); 735 736 // This should run after inlining to have any chance of doing 737 // anything, and before other cleanup optimizations. 738 FPM.addPass(AMDGPULowerKernelAttributesPass()); 739 740 if (Level != OptimizationLevel::O0) { 741 // Promote alloca to vector before SROA and loop unroll. If we 742 // manage to eliminate allocas before unroll we may choose to unroll 743 // less. 744 FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this)); 745 } 746 747 PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM))); 748 }); 749 } 750 751 int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) { 752 return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || 753 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || 754 AddrSpace == AMDGPUAS::REGION_ADDRESS) 755 ? -1 756 : 0; 757 } 758 759 bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, 760 unsigned DestAS) const { 761 return AMDGPU::isFlatGlobalAddrSpace(SrcAS) && 762 AMDGPU::isFlatGlobalAddrSpace(DestAS); 763 } 764 765 unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const { 766 const auto *LD = dyn_cast<LoadInst>(V); 767 if (!LD) 768 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; 769 770 // It must be a generic pointer loaded. 771 assert(V->getType()->isPointerTy() && 772 V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS); 773 774 const auto *Ptr = LD->getPointerOperand(); 775 if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) 776 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; 777 // For a generic pointer loaded from the constant memory, it could be assumed 778 // as a global pointer since the constant memory is only populated on the 779 // host side. As implied by the offload programming model, only global 780 // pointers could be referenced on the host side. 781 return AMDGPUAS::GLOBAL_ADDRESS; 782 } 783 784 std::pair<const Value *, unsigned> 785 AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const { 786 if (auto *II = dyn_cast<IntrinsicInst>(V)) { 787 switch (II->getIntrinsicID()) { 788 case Intrinsic::amdgcn_is_shared: 789 return std::make_pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS); 790 case Intrinsic::amdgcn_is_private: 791 return std::make_pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS); 792 default: 793 break; 794 } 795 return std::make_pair(nullptr, -1); 796 } 797 // Check the global pointer predication based on 798 // (!is_share(p) && !is_private(p)). Note that logic 'and' is commutative and 799 // the order of 'is_shared' and 'is_private' is not significant. 800 Value *Ptr; 801 if (match( 802 const_cast<Value *>(V), 803 m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))), 804 m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>( 805 m_Deferred(Ptr)))))) 806 return std::make_pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS); 807 808 return std::make_pair(nullptr, -1); 809 } 810 811 unsigned 812 AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const { 813 switch (Kind) { 814 case PseudoSourceValue::Stack: 815 case PseudoSourceValue::FixedStack: 816 return AMDGPUAS::PRIVATE_ADDRESS; 817 case PseudoSourceValue::ConstantPool: 818 case PseudoSourceValue::GOT: 819 case PseudoSourceValue::JumpTable: 820 case PseudoSourceValue::GlobalValueCallEntry: 821 case PseudoSourceValue::ExternalSymbolCallEntry: 822 case PseudoSourceValue::TargetCustom: 823 return AMDGPUAS::CONSTANT_ADDRESS; 824 } 825 return AMDGPUAS::FLAT_ADDRESS; 826 } 827 828 //===----------------------------------------------------------------------===// 829 // GCN Target Machine (SI+) 830 //===----------------------------------------------------------------------===// 831 832 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, 833 StringRef CPU, StringRef FS, 834 TargetOptions Options, 835 Optional<Reloc::Model> RM, 836 Optional<CodeModel::Model> CM, 837 CodeGenOpt::Level OL, bool JIT) 838 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} 839 840 const TargetSubtargetInfo * 841 GCNTargetMachine::getSubtargetImpl(const Function &F) const { 842 StringRef GPU = getGPUName(F); 843 StringRef FS = getFeatureString(F); 844 845 SmallString<128> SubtargetKey(GPU); 846 SubtargetKey.append(FS); 847 848 auto &I = SubtargetMap[SubtargetKey]; 849 if (!I) { 850 // This needs to be done before we create a new subtarget since any 851 // creation will depend on the TM and the code generation flags on the 852 // function that reside in TargetOptions. 853 resetTargetOptions(F); 854 I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this); 855 } 856 857 I->setScalarizeGlobalBehavior(ScalarizeGlobal); 858 859 return I.get(); 860 } 861 862 TargetTransformInfo 863 GCNTargetMachine::getTargetTransformInfo(const Function &F) const { 864 return TargetTransformInfo(GCNTTIImpl(this, F)); 865 } 866 867 //===----------------------------------------------------------------------===// 868 // AMDGPU Pass Setup 869 //===----------------------------------------------------------------------===// 870 871 std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const { 872 return getStandardCSEConfigForOpt(TM->getOptLevel()); 873 } 874 875 namespace { 876 877 class GCNPassConfig final : public AMDGPUPassConfig { 878 public: 879 GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) 880 : AMDGPUPassConfig(TM, PM) { 881 // It is necessary to know the register usage of the entire call graph. We 882 // allow calls without EnableAMDGPUFunctionCalls if they are marked 883 // noinline, so this is always required. 884 setRequiresCodeGenSCCOrder(true); 885 substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); 886 } 887 888 GCNTargetMachine &getGCNTargetMachine() const { 889 return getTM<GCNTargetMachine>(); 890 } 891 892 ScheduleDAGInstrs * 893 createMachineScheduler(MachineSchedContext *C) const override; 894 895 ScheduleDAGInstrs * 896 createPostMachineScheduler(MachineSchedContext *C) const override { 897 ScheduleDAGMI *DAG = createGenericSchedPostRA(C); 898 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); 899 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); 900 DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); 901 DAG->addMutation(createMFMAIGroupLPDAGMutation()); 902 return DAG; 903 } 904 905 bool addPreISel() override; 906 void addMachineSSAOptimization() override; 907 bool addILPOpts() override; 908 bool addInstSelector() override; 909 bool addIRTranslator() override; 910 void addPreLegalizeMachineIR() override; 911 bool addLegalizeMachineIR() override; 912 void addPreRegBankSelect() override; 913 bool addRegBankSelect() override; 914 void addPreGlobalInstructionSelect() override; 915 bool addGlobalInstructionSelect() override; 916 void addFastRegAlloc() override; 917 void addOptimizedRegAlloc() override; 918 919 FunctionPass *createSGPRAllocPass(bool Optimized); 920 FunctionPass *createVGPRAllocPass(bool Optimized); 921 FunctionPass *createRegAllocPass(bool Optimized) override; 922 923 bool addRegAssignAndRewriteFast() override; 924 bool addRegAssignAndRewriteOptimized() override; 925 926 void addPreRegAlloc() override; 927 bool addPreRewrite() override; 928 void addPostRegAlloc() override; 929 void addPreSched2() override; 930 void addPreEmitPass() override; 931 }; 932 933 } // end anonymous namespace 934 935 AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) 936 : TargetPassConfig(TM, PM) { 937 // Exceptions and StackMaps are not supported, so these passes will never do 938 // anything. 939 disablePass(&StackMapLivenessID); 940 disablePass(&FuncletLayoutID); 941 // Garbage collection is not supported. 942 disablePass(&GCLoweringID); 943 disablePass(&ShadowStackGCLoweringID); 944 } 945 946 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { 947 if (getOptLevel() == CodeGenOpt::Aggressive) 948 addPass(createGVNPass()); 949 else 950 addPass(createEarlyCSEPass()); 951 } 952 953 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { 954 addPass(createLICMPass()); 955 addPass(createSeparateConstOffsetFromGEPPass()); 956 addPass(createSpeculativeExecutionPass()); 957 // ReassociateGEPs exposes more opportunities for SLSR. See 958 // the example in reassociate-geps-and-slsr.ll. 959 addPass(createStraightLineStrengthReducePass()); 960 // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or 961 // EarlyCSE can reuse. 962 addEarlyCSEOrGVNPass(); 963 // Run NaryReassociate after EarlyCSE/GVN to be more effective. 964 addPass(createNaryReassociatePass()); 965 // NaryReassociate on GEPs creates redundant common expressions, so run 966 // EarlyCSE after it. 967 addPass(createEarlyCSEPass()); 968 } 969 970 void AMDGPUPassConfig::addIRPasses() { 971 const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); 972 973 // There is no reason to run these. 974 disablePass(&StackMapLivenessID); 975 disablePass(&FuncletLayoutID); 976 disablePass(&PatchableFunctionID); 977 978 addPass(createAMDGPUPrintfRuntimeBinding()); 979 addPass(createAMDGPUCtorDtorLoweringPass()); 980 981 // A call to propagate attributes pass in the backend in case opt was not run. 982 addPass(createAMDGPUPropagateAttributesEarlyPass(&TM)); 983 984 addPass(createAMDGPULowerIntrinsicsPass()); 985 986 // Function calls are not supported, so make sure we inline everything. 987 addPass(createAMDGPUAlwaysInlinePass()); 988 addPass(createAlwaysInlinerLegacyPass()); 989 // We need to add the barrier noop pass, otherwise adding the function 990 // inlining pass will cause all of the PassConfigs passes to be run 991 // one function at a time, which means if we have a module with two 992 // functions, then we will generate code for the first function 993 // without ever running any passes on the second. 994 addPass(createBarrierNoopPass()); 995 996 // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. 997 if (TM.getTargetTriple().getArch() == Triple::r600) 998 addPass(createR600OpenCLImageTypeLoweringPass()); 999 1000 // Replace OpenCL enqueued block function pointers with global variables. 1001 addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); 1002 1003 // Can increase LDS used by kernel so runs before PromoteAlloca 1004 if (EnableLowerModuleLDS) { 1005 // The pass "amdgpu-replace-lds-use-with-pointer" need to be run before the 1006 // pass "amdgpu-lower-module-lds", and also it required to be run only if 1007 // "amdgpu-lower-module-lds" pass is enabled. 1008 if (EnableLDSReplaceWithPointer) 1009 addPass(createAMDGPUReplaceLDSUseWithPointerPass()); 1010 1011 addPass(createAMDGPULowerModuleLDSPass()); 1012 } 1013 1014 if (TM.getOptLevel() > CodeGenOpt::None) 1015 addPass(createInferAddressSpacesPass()); 1016 1017 addPass(createAtomicExpandPass()); 1018 1019 if (TM.getOptLevel() > CodeGenOpt::None) { 1020 addPass(createAMDGPUPromoteAlloca()); 1021 1022 if (EnableSROA) 1023 addPass(createSROAPass()); 1024 if (isPassEnabled(EnableScalarIRPasses)) 1025 addStraightLineScalarOptimizationPasses(); 1026 1027 if (EnableAMDGPUAliasAnalysis) { 1028 addPass(createAMDGPUAAWrapperPass()); 1029 addPass(createExternalAAWrapperPass([](Pass &P, Function &, 1030 AAResults &AAR) { 1031 if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>()) 1032 AAR.addAAResult(WrapperPass->getResult()); 1033 })); 1034 } 1035 1036 if (TM.getTargetTriple().getArch() == Triple::amdgcn) { 1037 // TODO: May want to move later or split into an early and late one. 1038 addPass(createAMDGPUCodeGenPreparePass()); 1039 } 1040 } 1041 1042 TargetPassConfig::addIRPasses(); 1043 1044 // EarlyCSE is not always strong enough to clean up what LSR produces. For 1045 // example, GVN can combine 1046 // 1047 // %0 = add %a, %b 1048 // %1 = add %b, %a 1049 // 1050 // and 1051 // 1052 // %0 = shl nsw %a, 2 1053 // %1 = shl %a, 2 1054 // 1055 // but EarlyCSE can do neither of them. 1056 if (isPassEnabled(EnableScalarIRPasses)) 1057 addEarlyCSEOrGVNPass(); 1058 } 1059 1060 void AMDGPUPassConfig::addCodeGenPrepare() { 1061 if (TM->getTargetTriple().getArch() == Triple::amdgcn) { 1062 addPass(createAMDGPUAttributorPass()); 1063 1064 // FIXME: This pass adds 2 hacky attributes that can be replaced with an 1065 // analysis, and should be removed. 1066 addPass(createAMDGPUAnnotateKernelFeaturesPass()); 1067 } 1068 1069 if (TM->getTargetTriple().getArch() == Triple::amdgcn && 1070 EnableLowerKernelArguments) 1071 addPass(createAMDGPULowerKernelArgumentsPass()); 1072 1073 TargetPassConfig::addCodeGenPrepare(); 1074 1075 if (isPassEnabled(EnableLoadStoreVectorizer)) 1076 addPass(createLoadStoreVectorizerPass()); 1077 1078 // LowerSwitch pass may introduce unreachable blocks that can 1079 // cause unexpected behavior for subsequent passes. Placing it 1080 // here seems better that these blocks would get cleaned up by 1081 // UnreachableBlockElim inserted next in the pass flow. 1082 addPass(createLowerSwitchPass()); 1083 } 1084 1085 bool AMDGPUPassConfig::addPreISel() { 1086 if (TM->getOptLevel() > CodeGenOpt::None) 1087 addPass(createFlattenCFGPass()); 1088 return false; 1089 } 1090 1091 bool AMDGPUPassConfig::addInstSelector() { 1092 addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel())); 1093 return false; 1094 } 1095 1096 bool AMDGPUPassConfig::addGCPasses() { 1097 // Do nothing. GC is not supported. 1098 return false; 1099 } 1100 1101 llvm::ScheduleDAGInstrs * 1102 AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const { 1103 ScheduleDAGMILive *DAG = createGenericSchedLive(C); 1104 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); 1105 return DAG; 1106 } 1107 1108 //===----------------------------------------------------------------------===// 1109 // GCN Pass Setup 1110 //===----------------------------------------------------------------------===// 1111 1112 ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( 1113 MachineSchedContext *C) const { 1114 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); 1115 if (ST.enableSIScheduler()) 1116 return createSIMachineScheduler(C); 1117 return createGCNMaxOccupancyMachineScheduler(C); 1118 } 1119 1120 bool GCNPassConfig::addPreISel() { 1121 AMDGPUPassConfig::addPreISel(); 1122 1123 if (TM->getOptLevel() > CodeGenOpt::None) 1124 addPass(createAMDGPULateCodeGenPreparePass()); 1125 1126 if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) { 1127 addPass(createAMDGPUAtomicOptimizerPass()); 1128 } 1129 1130 if (TM->getOptLevel() > CodeGenOpt::None) 1131 addPass(createSinkingPass()); 1132 1133 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit 1134 // regions formed by them. 1135 addPass(&AMDGPUUnifyDivergentExitNodesID); 1136 if (!LateCFGStructurize) { 1137 if (EnableStructurizerWorkarounds) { 1138 addPass(createFixIrreduciblePass()); 1139 addPass(createUnifyLoopExitsPass()); 1140 } 1141 addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions 1142 } 1143 addPass(createAMDGPUAnnotateUniformValues()); 1144 if (!LateCFGStructurize) { 1145 addPass(createSIAnnotateControlFlowPass()); 1146 } 1147 addPass(createLCSSAPass()); 1148 1149 if (TM->getOptLevel() > CodeGenOpt::Less) 1150 addPass(&AMDGPUPerfHintAnalysisID); 1151 1152 return false; 1153 } 1154 1155 void GCNPassConfig::addMachineSSAOptimization() { 1156 TargetPassConfig::addMachineSSAOptimization(); 1157 1158 // We want to fold operands after PeepholeOptimizer has run (or as part of 1159 // it), because it will eliminate extra copies making it easier to fold the 1160 // real source operand. We want to eliminate dead instructions after, so that 1161 // we see fewer uses of the copies. We then need to clean up the dead 1162 // instructions leftover after the operands are folded as well. 1163 // 1164 // XXX - Can we get away without running DeadMachineInstructionElim again? 1165 addPass(&SIFoldOperandsID); 1166 if (EnableDPPCombine) 1167 addPass(&GCNDPPCombineID); 1168 addPass(&SILoadStoreOptimizerID); 1169 if (isPassEnabled(EnableSDWAPeephole)) { 1170 addPass(&SIPeepholeSDWAID); 1171 addPass(&EarlyMachineLICMID); 1172 addPass(&MachineCSEID); 1173 addPass(&SIFoldOperandsID); 1174 } 1175 addPass(&DeadMachineInstructionElimID); 1176 addPass(createSIShrinkInstructionsPass()); 1177 } 1178 1179 bool GCNPassConfig::addILPOpts() { 1180 if (EnableEarlyIfConversion) 1181 addPass(&EarlyIfConverterID); 1182 1183 TargetPassConfig::addILPOpts(); 1184 return false; 1185 } 1186 1187 bool GCNPassConfig::addInstSelector() { 1188 AMDGPUPassConfig::addInstSelector(); 1189 addPass(&SIFixSGPRCopiesID); 1190 addPass(createSILowerI1CopiesPass()); 1191 return false; 1192 } 1193 1194 bool GCNPassConfig::addIRTranslator() { 1195 addPass(new IRTranslator(getOptLevel())); 1196 return false; 1197 } 1198 1199 void GCNPassConfig::addPreLegalizeMachineIR() { 1200 bool IsOptNone = getOptLevel() == CodeGenOpt::None; 1201 addPass(createAMDGPUPreLegalizeCombiner(IsOptNone)); 1202 addPass(new Localizer()); 1203 } 1204 1205 bool GCNPassConfig::addLegalizeMachineIR() { 1206 addPass(new Legalizer()); 1207 return false; 1208 } 1209 1210 void GCNPassConfig::addPreRegBankSelect() { 1211 bool IsOptNone = getOptLevel() == CodeGenOpt::None; 1212 addPass(createAMDGPUPostLegalizeCombiner(IsOptNone)); 1213 } 1214 1215 bool GCNPassConfig::addRegBankSelect() { 1216 addPass(new RegBankSelect()); 1217 return false; 1218 } 1219 1220 void GCNPassConfig::addPreGlobalInstructionSelect() { 1221 bool IsOptNone = getOptLevel() == CodeGenOpt::None; 1222 addPass(createAMDGPURegBankCombiner(IsOptNone)); 1223 } 1224 1225 bool GCNPassConfig::addGlobalInstructionSelect() { 1226 addPass(new InstructionSelect(getOptLevel())); 1227 return false; 1228 } 1229 1230 void GCNPassConfig::addPreRegAlloc() { 1231 if (LateCFGStructurize) { 1232 addPass(createAMDGPUMachineCFGStructurizerPass()); 1233 } 1234 } 1235 1236 void GCNPassConfig::addFastRegAlloc() { 1237 // FIXME: We have to disable the verifier here because of PHIElimination + 1238 // TwoAddressInstructions disabling it. 1239 1240 // This must be run immediately after phi elimination and before 1241 // TwoAddressInstructions, otherwise the processing of the tied operand of 1242 // SI_ELSE will introduce a copy of the tied operand source after the else. 1243 insertPass(&PHIEliminationID, &SILowerControlFlowID); 1244 1245 insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID); 1246 insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID); 1247 1248 TargetPassConfig::addFastRegAlloc(); 1249 } 1250 1251 void GCNPassConfig::addOptimizedRegAlloc() { 1252 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation 1253 // instructions that cause scheduling barriers. 1254 insertPass(&MachineSchedulerID, &SIWholeQuadModeID); 1255 insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID); 1256 1257 if (OptExecMaskPreRA) 1258 insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); 1259 1260 if (isPassEnabled(EnablePreRAOptimizations)) 1261 insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID); 1262 1263 // This is not an essential optimization and it has a noticeable impact on 1264 // compilation time, so we only enable it from O2. 1265 if (TM->getOptLevel() > CodeGenOpt::Less) 1266 insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); 1267 1268 // FIXME: when an instruction has a Killed operand, and the instruction is 1269 // inside a bundle, seems only the BUNDLE instruction appears as the Kills of 1270 // the register in LiveVariables, this would trigger a failure in verifier, 1271 // we should fix it and enable the verifier. 1272 if (OptVGPRLiveRange) 1273 insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID); 1274 // This must be run immediately after phi elimination and before 1275 // TwoAddressInstructions, otherwise the processing of the tied operand of 1276 // SI_ELSE will introduce a copy of the tied operand source after the else. 1277 insertPass(&PHIEliminationID, &SILowerControlFlowID); 1278 1279 if (EnableDCEInRA) 1280 insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID); 1281 1282 TargetPassConfig::addOptimizedRegAlloc(); 1283 } 1284 1285 bool GCNPassConfig::addPreRewrite() { 1286 if (EnableRegReassign) 1287 addPass(&GCNNSAReassignID); 1288 return true; 1289 } 1290 1291 FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) { 1292 // Initialize the global default. 1293 llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag, 1294 initializeDefaultSGPRRegisterAllocatorOnce); 1295 1296 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); 1297 if (Ctor != useDefaultRegisterAllocator) 1298 return Ctor(); 1299 1300 if (Optimized) 1301 return createGreedyRegisterAllocator(onlyAllocateSGPRs); 1302 1303 return createFastRegisterAllocator(onlyAllocateSGPRs, false); 1304 } 1305 1306 FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) { 1307 // Initialize the global default. 1308 llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag, 1309 initializeDefaultVGPRRegisterAllocatorOnce); 1310 1311 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); 1312 if (Ctor != useDefaultRegisterAllocator) 1313 return Ctor(); 1314 1315 if (Optimized) 1316 return createGreedyVGPRRegisterAllocator(); 1317 1318 return createFastVGPRRegisterAllocator(); 1319 } 1320 1321 FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) { 1322 llvm_unreachable("should not be used"); 1323 } 1324 1325 static const char RegAllocOptNotSupportedMessage[] = 1326 "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc"; 1327 1328 bool GCNPassConfig::addRegAssignAndRewriteFast() { 1329 if (!usingDefaultRegAlloc()) 1330 report_fatal_error(RegAllocOptNotSupportedMessage); 1331 1332 addPass(createSGPRAllocPass(false)); 1333 1334 // Equivalent of PEI for SGPRs. 1335 addPass(&SILowerSGPRSpillsID); 1336 1337 addPass(createVGPRAllocPass(false)); 1338 return true; 1339 } 1340 1341 bool GCNPassConfig::addRegAssignAndRewriteOptimized() { 1342 if (!usingDefaultRegAlloc()) 1343 report_fatal_error(RegAllocOptNotSupportedMessage); 1344 1345 addPass(createSGPRAllocPass(true)); 1346 1347 // Commit allocated register changes. This is mostly necessary because too 1348 // many things rely on the use lists of the physical registers, such as the 1349 // verifier. This is only necessary with allocators which use LiveIntervals, 1350 // since FastRegAlloc does the replacements itself. 1351 addPass(createVirtRegRewriter(false)); 1352 1353 // Equivalent of PEI for SGPRs. 1354 addPass(&SILowerSGPRSpillsID); 1355 1356 addPass(createVGPRAllocPass(true)); 1357 1358 addPreRewrite(); 1359 addPass(&VirtRegRewriterID); 1360 1361 return true; 1362 } 1363 1364 void GCNPassConfig::addPostRegAlloc() { 1365 addPass(&SIFixVGPRCopiesID); 1366 if (getOptLevel() > CodeGenOpt::None) 1367 addPass(&SIOptimizeExecMaskingID); 1368 TargetPassConfig::addPostRegAlloc(); 1369 } 1370 1371 void GCNPassConfig::addPreSched2() { 1372 if (TM->getOptLevel() > CodeGenOpt::None) 1373 addPass(createSIShrinkInstructionsPass()); 1374 addPass(&SIPostRABundlerID); 1375 } 1376 1377 void GCNPassConfig::addPreEmitPass() { 1378 addPass(createSIMemoryLegalizerPass()); 1379 addPass(createSIInsertWaitcntsPass()); 1380 1381 addPass(createSIModeRegisterPass()); 1382 1383 if (getOptLevel() > CodeGenOpt::None) 1384 addPass(&SIInsertHardClausesID); 1385 1386 addPass(&SILateBranchLoweringPassID); 1387 if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less)) 1388 addPass(createAMDGPUSetWavePriorityPass()); 1389 if (getOptLevel() > CodeGenOpt::None) 1390 addPass(&SIPreEmitPeepholeID); 1391 // The hazard recognizer that runs as part of the post-ra scheduler does not 1392 // guarantee to be able handle all hazards correctly. This is because if there 1393 // are multiple scheduling regions in a basic block, the regions are scheduled 1394 // bottom up, so when we begin to schedule a region we don't know what 1395 // instructions were emitted directly before it. 1396 // 1397 // Here we add a stand-alone hazard recognizer pass which can handle all 1398 // cases. 1399 addPass(&PostRAHazardRecognizerID); 1400 addPass(&BranchRelaxationPassID); 1401 } 1402 1403 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { 1404 return new GCNPassConfig(*this, PM); 1405 } 1406 1407 yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const { 1408 return new yaml::SIMachineFunctionInfo(); 1409 } 1410 1411 yaml::MachineFunctionInfo * 1412 GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const { 1413 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1414 return new yaml::SIMachineFunctionInfo( 1415 *MFI, *MF.getSubtarget().getRegisterInfo(), MF); 1416 } 1417 1418 bool GCNTargetMachine::parseMachineFunctionInfo( 1419 const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS, 1420 SMDiagnostic &Error, SMRange &SourceRange) const { 1421 const yaml::SIMachineFunctionInfo &YamlMFI = 1422 static_cast<const yaml::SIMachineFunctionInfo &>(MFI_); 1423 MachineFunction &MF = PFS.MF; 1424 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1425 1426 if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange)) 1427 return true; 1428 1429 if (MFI->Occupancy == 0) { 1430 // Fixup the subtarget dependent default value. 1431 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1432 MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize()); 1433 } 1434 1435 auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) { 1436 Register TempReg; 1437 if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) { 1438 SourceRange = RegName.SourceRange; 1439 return true; 1440 } 1441 RegVal = TempReg; 1442 1443 return false; 1444 }; 1445 1446 auto parseOptionalRegister = [&](const yaml::StringValue &RegName, 1447 Register &RegVal) { 1448 return !RegName.Value.empty() && parseRegister(RegName, RegVal); 1449 }; 1450 1451 if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy)) 1452 return true; 1453 1454 auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) { 1455 // Create a diagnostic for a the register string literal. 1456 const MemoryBuffer &Buffer = 1457 *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID()); 1458 Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1459 RegName.Value.size(), SourceMgr::DK_Error, 1460 "incorrect register class for field", RegName.Value, 1461 None, None); 1462 SourceRange = RegName.SourceRange; 1463 return true; 1464 }; 1465 1466 if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) || 1467 parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) || 1468 parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg)) 1469 return true; 1470 1471 if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG && 1472 !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) { 1473 return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg); 1474 } 1475 1476 if (MFI->FrameOffsetReg != AMDGPU::FP_REG && 1477 !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) { 1478 return diagnoseRegisterClass(YamlMFI.FrameOffsetReg); 1479 } 1480 1481 if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG && 1482 !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) { 1483 return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg); 1484 } 1485 1486 for (const auto &YamlReg : YamlMFI.WWMReservedRegs) { 1487 Register ParsedReg; 1488 if (parseRegister(YamlReg, ParsedReg)) 1489 return true; 1490 1491 MFI->reserveWWMRegister(ParsedReg); 1492 } 1493 1494 auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A, 1495 const TargetRegisterClass &RC, 1496 ArgDescriptor &Arg, unsigned UserSGPRs, 1497 unsigned SystemSGPRs) { 1498 // Skip parsing if it's not present. 1499 if (!A) 1500 return false; 1501 1502 if (A->IsRegister) { 1503 Register Reg; 1504 if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) { 1505 SourceRange = A->RegisterName.SourceRange; 1506 return true; 1507 } 1508 if (!RC.contains(Reg)) 1509 return diagnoseRegisterClass(A->RegisterName); 1510 Arg = ArgDescriptor::createRegister(Reg); 1511 } else 1512 Arg = ArgDescriptor::createStack(A->StackOffset); 1513 // Check and apply the optional mask. 1514 if (A->Mask) 1515 Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue()); 1516 1517 MFI->NumUserSGPRs += UserSGPRs; 1518 MFI->NumSystemSGPRs += SystemSGPRs; 1519 return false; 1520 }; 1521 1522 if (YamlMFI.ArgInfo && 1523 (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer, 1524 AMDGPU::SGPR_128RegClass, 1525 MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) || 1526 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr, 1527 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr, 1528 2, 0) || 1529 parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass, 1530 MFI->ArgInfo.QueuePtr, 2, 0) || 1531 parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr, 1532 AMDGPU::SReg_64RegClass, 1533 MFI->ArgInfo.KernargSegmentPtr, 2, 0) || 1534 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID, 1535 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID, 1536 2, 0) || 1537 parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit, 1538 AMDGPU::SReg_64RegClass, 1539 MFI->ArgInfo.FlatScratchInit, 2, 0) || 1540 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize, 1541 AMDGPU::SGPR_32RegClass, 1542 MFI->ArgInfo.PrivateSegmentSize, 0, 0) || 1543 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX, 1544 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX, 1545 0, 1) || 1546 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY, 1547 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY, 1548 0, 1) || 1549 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ, 1550 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ, 1551 0, 1) || 1552 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo, 1553 AMDGPU::SGPR_32RegClass, 1554 MFI->ArgInfo.WorkGroupInfo, 0, 1) || 1555 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset, 1556 AMDGPU::SGPR_32RegClass, 1557 MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) || 1558 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr, 1559 AMDGPU::SReg_64RegClass, 1560 MFI->ArgInfo.ImplicitArgPtr, 0, 0) || 1561 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr, 1562 AMDGPU::SReg_64RegClass, 1563 MFI->ArgInfo.ImplicitBufferPtr, 2, 0) || 1564 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX, 1565 AMDGPU::VGPR_32RegClass, 1566 MFI->ArgInfo.WorkItemIDX, 0, 0) || 1567 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY, 1568 AMDGPU::VGPR_32RegClass, 1569 MFI->ArgInfo.WorkItemIDY, 0, 0) || 1570 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ, 1571 AMDGPU::VGPR_32RegClass, 1572 MFI->ArgInfo.WorkItemIDZ, 0, 0))) 1573 return true; 1574 1575 MFI->Mode.IEEE = YamlMFI.Mode.IEEE; 1576 MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp; 1577 MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals; 1578 MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals; 1579 MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals; 1580 MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals; 1581 1582 return false; 1583 } 1584