//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but this should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(0),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    FMA(false),
    MIMG_R128(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDLInsts(false),
    D16PreservesUnusedBits(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
              Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const GCNTargetMachine &TM)
  : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
    TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));

  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
    *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(const Function &F,
                                            unsigned ExplicitArgBytes) const {
  uint64_t TotalSize = ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);

  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them to prevent
    // the scheduler from moving them apart.
    // During DAG pre-processing, SUnits are in the original order of the
    // instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void SISubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}