//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

/// -{sgpr|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
                    "pick SGPR register allocator based on -O option",
                    useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for VGPRs"));

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
  "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
  "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
  "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
  "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
  "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
  "fast", "fast register allocator", createFastVGPRRegisterAllocator);
}

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
optimizations"), 185 cl::init(true)); 186 187 // Option to disable vectorizer for tests. 188 static cl::opt<bool> EnableLoadStoreVectorizer( 189 "amdgpu-load-store-vectorizer", 190 cl::desc("Enable load store vectorizer"), 191 cl::init(true), 192 cl::Hidden); 193 194 // Option to control global loads scalarization 195 static cl::opt<bool> ScalarizeGlobal( 196 "amdgpu-scalarize-global-loads", 197 cl::desc("Enable global load scalarization"), 198 cl::init(true), 199 cl::Hidden); 200 201 // Option to run internalize pass. 202 static cl::opt<bool> InternalizeSymbols( 203 "amdgpu-internalize-symbols", 204 cl::desc("Enable elimination of non-kernel functions and unused globals"), 205 cl::init(false), 206 cl::Hidden); 207 208 // Option to inline all early. 209 static cl::opt<bool> EarlyInlineAll( 210 "amdgpu-early-inline-all", 211 cl::desc("Inline all functions early"), 212 cl::init(false), 213 cl::Hidden); 214 215 static cl::opt<bool> EnableSDWAPeephole( 216 "amdgpu-sdwa-peephole", 217 cl::desc("Enable SDWA peepholer"), 218 cl::init(true)); 219 220 static cl::opt<bool> EnableDPPCombine( 221 "amdgpu-dpp-combine", 222 cl::desc("Enable DPP combiner"), 223 cl::init(true)); 224 225 // Enable address space based alias analysis 226 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, 227 cl::desc("Enable AMDGPU Alias Analysis"), 228 cl::init(true)); 229 230 // Option to run late CFG structurizer 231 static cl::opt<bool, true> LateCFGStructurize( 232 "amdgpu-late-structurize", 233 cl::desc("Enable late CFG structurization"), 234 cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), 235 cl::Hidden); 236 237 // Enable lib calls simplifications 238 static cl::opt<bool> EnableLibCallSimplify( 239 "amdgpu-simplify-libcall", 240 cl::desc("Enable amdgpu library simplifications"), 241 cl::init(true), 242 cl::Hidden); 243 244 static cl::opt<bool> EnableLowerKernelArguments( 245 "amdgpu-ir-lower-kernel-arguments", 246 cl::desc("Lower kernel argument loads in IR pass"), 247 cl::init(true), 248 cl::Hidden); 249 250 static cl::opt<bool> EnableRegReassign( 251 "amdgpu-reassign-regs", 252 cl::desc("Enable register reassign optimizations on gfx10+"), 253 cl::init(true), 254 cl::Hidden); 255 256 static cl::opt<bool> OptVGPRLiveRange( 257 "amdgpu-opt-vgpr-liverange", 258 cl::desc("Enable VGPR liverange optimizations for if-else structure"), 259 cl::init(true), cl::Hidden); 260 261 // Enable atomic optimization 262 static cl::opt<bool> EnableAtomicOptimizations( 263 "amdgpu-atomic-optimizations", 264 cl::desc("Enable atomic optimizations"), 265 cl::init(false), 266 cl::Hidden); 267 268 // Enable Mode register optimization 269 static cl::opt<bool> EnableSIModeRegisterPass( 270 "amdgpu-mode-register", 271 cl::desc("Enable mode register pass"), 272 cl::init(true), 273 cl::Hidden); 274 275 // Option is used in lit tests to prevent deadcoding of patterns inspected. 
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
              cl::init(true), cl::Hidden,
              cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
  "amdgpu-scalar-ir-passes",
  cl::desc("Enable scalar IR passes"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableStructurizerWorkarounds(
    "amdgpu-enable-structurizer-workarounds",
    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLDSReplaceWithPointer(
    "amdgpu-enable-lds-replace-with-pointer",
    cl::desc("Enable LDS replace with pointer pass"), cl::init(false),
    cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeGCNDPPCombinePass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSILowerSGPRSpillsPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSIOptimizeVGPRLiveRangePass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUCtorDtorLoweringPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAttributorPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPostLegalizerCombinerPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
  initializeAMDGPURegBankCombinerPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPULateCodeGenPreparePass(*PR);
  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
  initializeAMDGPUPropagateAttributesLatePass(*PR);
  initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
  initializeAMDGPULowerModuleLDSPass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertHardClausesPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIModeRegisterPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIPreEmitPeepholePass(*PR);
  initializeSILateBranchLoweringPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIPreAllocateWWMRegsPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeSIPostRABundlerPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeAMDGPUResourceUsageAnalysisPass(*PR);
  initializeGCNNSAReassignPass(*PR);
  initializeGCNPreRAOptimizationsPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation());
  DAG->addMutation(createSchedBarrierDAGMutation());
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
  "Run GCN scheduler to maximize occupancy (experimental)",
  createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
  "Run GCN iterative scheduler for minimal register usage (experimental)",
(experimental)", 448 createMinRegScheduler); 449 450 static MachineSchedRegistry 451 GCNILPSchedRegistry("gcn-ilp", 452 "Run GCN iterative scheduler for ILP scheduling (experimental)", 453 createIterativeILPMachineScheduler); 454 455 static StringRef computeDataLayout(const Triple &TT) { 456 if (TT.getArch() == Triple::r600) { 457 // 32-bit pointers. 458 return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" 459 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"; 460 } 461 462 // 32-bit private, local, and region pointers. 64-bit global, constant and 463 // flat, non-integral buffer fat pointers. 464 return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" 465 "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" 466 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1" 467 "-ni:7"; 468 } 469 470 LLVM_READNONE 471 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { 472 if (!GPU.empty()) 473 return GPU; 474 475 // Need to default to a target with flat support for HSA. 476 if (TT.getArch() == Triple::amdgcn) 477 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic"; 478 479 return "r600"; 480 } 481 482 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { 483 // The AMDGPU toolchain only supports generating shared objects, so we 484 // must always use PIC. 485 return Reloc::PIC_; 486 } 487 488 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, 489 StringRef CPU, StringRef FS, 490 TargetOptions Options, 491 Optional<Reloc::Model> RM, 492 Optional<CodeModel::Model> CM, 493 CodeGenOpt::Level OptLevel) 494 : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), 495 FS, Options, getEffectiveRelocModel(RM), 496 getEffectiveCodeModel(CM, CodeModel::Small), OptLevel), 497 TLOF(createTLOF(getTargetTriple())) { 498 initAsmInfo(); 499 if (TT.getArch() == Triple::amdgcn) { 500 if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64")) 501 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64)); 502 else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32")) 503 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32)); 504 } 505 } 506 507 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; 508 bool AMDGPUTargetMachine::EnableFunctionCalls = false; 509 bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; 510 511 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; 512 513 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { 514 Attribute GPUAttr = F.getFnAttribute("target-cpu"); 515 return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU(); 516 } 517 518 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { 519 Attribute FSAttr = F.getFnAttribute("target-features"); 520 521 return FSAttr.isValid() ? FSAttr.getValueAsString() 522 : getTargetFeatureString(); 523 } 524 525 /// Predicate for Internalize pass. 
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().startswith("__asan_") ||
           F->getName().startswith("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}

void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
  bool PromoteKernelArguments =
      EnablePromoteKernelArguments && getOptLevel() > CodeGenOpt::Less;

  if (EnableFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createFunctionInliningPass();
  }

  Builder.addExtension(
      PassManagerBuilder::EP_ModuleOptimizerEarly,
      [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
                                                 legacy::PassManagerBase &PM) {
        if (AMDGPUAA) {
          PM.add(createAMDGPUAAWrapperPass());
          PM.add(createAMDGPUExternalAAWrapperPass());
        }
        PM.add(createAMDGPUUnifyMetadataPass());
        PM.add(createAMDGPUPrintfRuntimeBinding());
        if (Internalize)
          PM.add(createInternalizePass(mustPreserveGV));
        PM.add(createAMDGPUPropagateAttributesLatePass(this));
        if (Internalize)
          PM.add(createGlobalDCEPass());
        if (EarlyInline)
          PM.add(createAMDGPUAlwaysInlinePass(false));
      });

  Builder.addExtension(
      PassManagerBuilder::EP_EarlyAsPossible,
      [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
                                        legacy::PassManagerBase &PM) {
        if (AMDGPUAA) {
          PM.add(createAMDGPUAAWrapperPass());
          PM.add(createAMDGPUExternalAAWrapperPass());
        }
        PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
        PM.add(llvm::createAMDGPUUseNativeCallsPass());
        if (LibCallSimplify)
          PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this));
      });

  Builder.addExtension(
      PassManagerBuilder::EP_CGSCCOptimizerLate,
      [EnableOpt, PromoteKernelArguments](const PassManagerBuilder &,
                                          legacy::PassManagerBase &PM) {
        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (PromoteKernelArguments)
          PM.add(createAMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        PM.add(createInferAddressSpacesPass());

        // This should run after inlining to have any chance of doing anything,
        // and before other cleanup optimizations.
        PM.add(createAMDGPULowerKernelAttributesPass());

        // Promote alloca to vector before SROA and loop unroll. If we manage
        // to eliminate allocas before unroll we may choose to unroll less.
        if (EnableOpt)
          PM.add(createAMDGPUPromoteAllocaToVector());
      });
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, ModulePassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-propagate-attributes-late") {
          PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
          return true;
        }
        if (PassName == "amdgpu-unify-metadata") {
          PM.addPass(AMDGPUUnifyMetadataPass());
          return true;
        }
        if (PassName == "amdgpu-printf-runtime-binding") {
          PM.addPass(AMDGPUPrintfRuntimeBindingPass());
          return true;
        }
        if (PassName == "amdgpu-always-inline") {
          PM.addPass(AMDGPUAlwaysInlinePass());
          return true;
        }
        if (PassName == "amdgpu-replace-lds-use-with-pointer") {
          PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
          return true;
        }
        if (PassName == "amdgpu-lower-module-lds") {
          PM.addPass(AMDGPULowerModuleLDSPass());
          return true;
        }
        return false;
      });
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, FunctionPassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-simplifylib") {
          PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
          return true;
        }
        if (PassName == "amdgpu-usenative") {
          PM.addPass(AMDGPUUseNativeCallsPass());
          return true;
        }
        if (PassName == "amdgpu-promote-alloca") {
          PM.addPass(AMDGPUPromoteAllocaPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-alloca-to-vector") {
          PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
          return true;
        }
        if (PassName == "amdgpu-lower-kernel-attributes") {
          PM.addPass(AMDGPULowerKernelAttributesPass());
          return true;
        }
        if (PassName == "amdgpu-propagate-attributes-early") {
          PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-kernel-arguments") {
          PM.addPass(AMDGPUPromoteKernelArgumentsPass());
          return true;
        }
        return false;
      });

  PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
    FAM.registerPass([&] { return AMDGPUAA(); });
  });

  PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
    if (AAName == "amdgpu-aa") {
      AAM.registerFunctionAnalysis<AMDGPUAA>();
      return true;
    }
    return false;
  });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
          FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());
        PM.addPass(AMDGPUPrintfRuntimeBindingPass());

        if (InternalizeSymbols) {
          PM.addPass(InternalizePass(mustPreserveGV));
        }
        PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
        if (InternalizeSymbols) {
          PM.addPass(GlobalDCEPass());
        }
        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });
}

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a generic pointer loaded.
  assert(V->getType()->isPointerTy() &&
         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::make_pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::make_pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::make_pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative
  // and the order of 'is_shared' and 'is_private' is not significant.
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::make_pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::make_pair(nullptr, -1);
}

unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
  case PseudoSourceValue::TargetCustom:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
    DAG->addMutation(createIGroupLPDAGMutation());
    DAG->addMutation(createSchedBarrierDAGMutation());
    return DAG;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  addPass(createAMDGPUCtorDtorLoweringPass());

  // Run the propagate-attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  // Can increase the LDS used by a kernel, so it runs before PromoteAlloca.
  if (EnableLowerModuleLDS) {
    // The "amdgpu-replace-lds-use-with-pointer" pass needs to run before
    // "amdgpu-lower-module-lds", and it is only required when the
    // "amdgpu-lower-module-lds" pass is enabled.
    if (EnableLDSReplaceWithPointer)
      addPass(createAMDGPUReplaceLDSUseWithPointerPass());

    addPass(createAMDGPULowerModuleLDSPass());
  }

  if (TM.getOptLevel() > CodeGenOpt::None)
    addPass(createInferAddressSpacesPass());

  addPass(createAtomicExpandPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());
    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    addPass(createAMDGPUAttributorPass());

    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
    // analysis, and should be removed.
    addPass(createAMDGPUAnnotateKernelFeaturesPass());
  }

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  // The LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here means those
  // blocks get cleaned up by UnreachableBlockElim, which is inserted next in
  // the pass flow.
  addPass(createLowerSwitchPass());
}

bool AMDGPUPassConfig::addPreISel() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

llvm::ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createAMDGPULateCodeGenPreparePass());

  if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSinkingPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    if (EnableStructurizerWorkarounds) {
      addPass(createFixIrreduciblePass());
      addPass(createUnifyLoopExitsPass());
    }
    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
  }
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }
  addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOpt::Less)
    addPass(&AMDGPUPerfHintAnalysisID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&SILoadStoreOptimizerID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
  insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
  insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOpt::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  // FIXME: When an instruction has a killed operand and the instruction is
  // inside a bundle, it seems that only the BUNDLE instruction appears as the
  // kill of the register in LiveVariables. This triggers a verifier failure;
  // we should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}

FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
  "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(false));
  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSIShrinkInstructionsPass());
  addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler is not
  // guaranteed to handle all hazards correctly. This is because if there
  // are multiple scheduling regions in a basic block, the regions are scheduled
  // bottom up, so when we begin to schedule a region we don't know what
  // instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fix up the subtarget-dependent default value.
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         None, None);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
  MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
  MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
  MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
  MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;

  return false;
}