1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// Implements the AMDGPU specific subclass of TargetSubtarget. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "AMDGPUSubtarget.h" 16 #include "AMDGPU.h" 17 #include "AMDGPUTargetMachine.h" 18 #include "AMDGPUCallLowering.h" 19 #include "AMDGPUInstructionSelector.h" 20 #include "AMDGPULegalizerInfo.h" 21 #include "AMDGPURegisterBankInfo.h" 22 #include "SIMachineFunctionInfo.h" 23 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 24 #include "llvm/ADT/SmallString.h" 25 #include "llvm/CodeGen/MachineScheduler.h" 26 #include "llvm/IR/MDBuilder.h" 27 #include "llvm/CodeGen/TargetFrameLowering.h" 28 #include <algorithm> 29 30 using namespace llvm; 31 32 #define DEBUG_TYPE "amdgpu-subtarget" 33 34 #define GET_SUBTARGETINFO_TARGET_DESC 35 #define GET_SUBTARGETINFO_CTOR 36 #include "AMDGPUGenSubtargetInfo.inc" 37 38 AMDGPUSubtarget::~AMDGPUSubtarget() = default; 39 40 AMDGPUSubtarget & 41 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, 42 StringRef GPU, StringRef FS) { 43 // Determine default and user-specified characteristics 44 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be 45 // enabled, but some instructions do not respect them and they run at the 46 // double precision rate, so don't enable by default. 47 // 48 // We want to be able to turn these off, but making this a subtarget feature 49 // for SI has the unhelpful behavior that it unsets everything else if you 50 // disable it. 51 52 SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,"); 53 54 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. 55 FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,"; 56 57 // FIXME: I don't think think Evergreen has any useful support for 58 // denormals, but should be checked. Should we issue a warning somewhere 59 // if someone tries to enable these? 60 if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { 61 FullFS += "+fp64-fp16-denormals,"; 62 } else { 63 FullFS += "-fp32-denormals,"; 64 } 65 66 FullFS += FS; 67 68 ParseSubtargetFeatures(GPU, FullFS); 69 70 // We don't support FP64 for EG/NI atm. 71 assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)); 72 73 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es 74 // on VI and newer hardware to avoid assertion failures due to missing ADDR64 75 // variants of MUBUF instructions. 76 if (!hasAddr64() && !FS.contains("flat-for-global")) { 77 FlatForGlobal = true; 78 } 79 80 // Set defaults if needed. 81 if (MaxPrivateElementSize == 0) 82 MaxPrivateElementSize = 4; 83 84 if (LDSBankCount == 0) 85 LDSBankCount = 32; 86 87 if (TT.getArch() == Triple::amdgcn) { 88 if (LocalMemorySize == 0) 89 LocalMemorySize = 32768; 90 91 // Do something sensible for unspecified target. 92 if (!HasMovrel && !HasVGPRIndexMode) 93 HasMovrel = true; 94 } 95 96 return *this; 97 } 98 99 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 100 const TargetMachine &TM) 101 : AMDGPUGenSubtargetInfo(TT, GPU, FS), 102 TargetTriple(TT), 103 Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600), 104 IsaVersion(ISAVersion0_0_0), 105 WavefrontSize(0), 106 LocalMemorySize(0), 107 LDSBankCount(0), 108 MaxPrivateElementSize(0), 109 110 FastFMAF32(false), 111 HalfRate64Ops(false), 112 113 FP32Denormals(false), 114 FP64FP16Denormals(false), 115 FPExceptions(false), 116 DX10Clamp(false), 117 FlatForGlobal(false), 118 AutoWaitcntBeforeBarrier(false), 119 CodeObjectV3(false), 120 UnalignedScratchAccess(false), 121 UnalignedBufferAccess(false), 122 123 HasApertureRegs(false), 124 EnableXNACK(false), 125 TrapHandler(false), 126 DebuggerInsertNops(false), 127 DebuggerEmitPrologue(false), 128 129 EnableHugePrivateBuffer(false), 130 EnableVGPRSpilling(false), 131 EnablePromoteAlloca(false), 132 EnableLoadStoreOpt(false), 133 EnableUnsafeDSOffsetFolding(false), 134 EnableSIScheduler(false), 135 EnableDS128(false), 136 DumpCode(false), 137 138 FP64(false), 139 FMA(false), 140 MIMG_R128(false), 141 IsGCN(false), 142 GCN3Encoding(false), 143 CIInsts(false), 144 GFX9Insts(false), 145 SGPRInitBug(false), 146 HasSMemRealTime(false), 147 Has16BitInsts(false), 148 HasIntClamp(false), 149 HasVOP3PInsts(false), 150 HasMadMixInsts(false), 151 HasFmaMixInsts(false), 152 HasMovrel(false), 153 HasVGPRIndexMode(false), 154 HasScalarStores(false), 155 HasScalarAtomics(false), 156 HasInv2PiInlineImm(false), 157 HasSDWA(false), 158 HasSDWAOmod(false), 159 HasSDWAScalar(false), 160 HasSDWASdst(false), 161 HasSDWAMac(false), 162 HasSDWAOutModsVOPC(false), 163 HasDPP(false), 164 HasDLInsts(false), 165 D16PreservesUnusedBits(false), 166 FlatAddressSpace(false), 167 FlatInstOffsets(false), 168 FlatGlobalInsts(false), 169 FlatScratchInsts(false), 170 AddNoCarryInsts(false), 171 HasUnpackedD16VMem(false), 172 173 R600ALUInst(false), 174 CaymanISA(false), 175 CFALUBug(false), 176 HasVertexCache(false), 177 TexVTXClauseSize(0), 178 ScalarizeGlobal(false), 179 180 FeatureDisable(false), 181 InstrItins(getInstrItineraryForCPU(GPU)) { 182 AS = AMDGPU::getAMDGPUAS(TT); 183 initializeSubtargetDependencies(TT, GPU, FS); 184 } 185 186 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, 187 const Function &F) const { 188 if (NWaves == 1) 189 return getLocalMemorySize(); 190 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; 191 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); 192 unsigned MaxWaves = getMaxWavesPerEU(); 193 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; 194 } 195 196 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, 197 const Function &F) const { 198 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; 199 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); 200 unsigned MaxWaves = getMaxWavesPerEU(); 201 unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; 202 unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); 203 NumWaves = std::min(NumWaves, MaxWaves); 204 NumWaves = std::max(NumWaves, 1u); 205 return NumWaves; 206 } 207 208 unsigned 209 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const { 210 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>(); 211 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction()); 212 } 213 214 std::pair<unsigned, unsigned> 215 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const { 216 switch (CC) { 217 case CallingConv::AMDGPU_CS: 218 case CallingConv::AMDGPU_KERNEL: 219 case CallingConv::SPIR_KERNEL: 220 return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4); 221 case CallingConv::AMDGPU_VS: 222 case CallingConv::AMDGPU_LS: 223 case CallingConv::AMDGPU_HS: 224 case CallingConv::AMDGPU_ES: 225 case CallingConv::AMDGPU_GS: 226 case CallingConv::AMDGPU_PS: 227 return std::make_pair(1, getWavefrontSize()); 228 default: 229 return std::make_pair(1, 16 * getWavefrontSize()); 230 } 231 } 232 233 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( 234 const Function &F) const { 235 // FIXME: 1024 if function. 236 // Default minimum/maximum flat work group sizes. 237 std::pair<unsigned, unsigned> Default = 238 getDefaultFlatWorkGroupSize(F.getCallingConv()); 239 240 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 241 // starts using "amdgpu-flat-work-group-size" attribute. 242 Default.second = AMDGPU::getIntegerAttribute( 243 F, "amdgpu-max-work-group-size", Default.second); 244 Default.first = std::min(Default.first, Default.second); 245 246 // Requested minimum/maximum flat work group sizes. 247 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 248 F, "amdgpu-flat-work-group-size", Default); 249 250 // Make sure requested minimum is less than requested maximum. 251 if (Requested.first > Requested.second) 252 return Default; 253 254 // Make sure requested values do not violate subtarget's specifications. 255 if (Requested.first < getMinFlatWorkGroupSize()) 256 return Default; 257 if (Requested.second > getMaxFlatWorkGroupSize()) 258 return Default; 259 260 return Requested; 261 } 262 263 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( 264 const Function &F) const { 265 // Default minimum/maximum number of waves per execution unit. 266 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU()); 267 268 // Default/requested minimum/maximum flat work group sizes. 269 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); 270 271 // If minimum/maximum flat work group sizes were explicitly requested using 272 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum 273 // number of waves per execution unit to values implied by requested 274 // minimum/maximum flat work group sizes. 275 unsigned MinImpliedByFlatWorkGroupSize = 276 getMaxWavesPerEU(FlatWorkGroupSizes.second); 277 bool RequestedFlatWorkGroupSize = false; 278 279 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 280 // starts using "amdgpu-flat-work-group-size" attribute. 281 if (F.hasFnAttribute("amdgpu-max-work-group-size") || 282 F.hasFnAttribute("amdgpu-flat-work-group-size")) { 283 Default.first = MinImpliedByFlatWorkGroupSize; 284 RequestedFlatWorkGroupSize = true; 285 } 286 287 // Requested minimum/maximum number of waves per execution unit. 288 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 289 F, "amdgpu-waves-per-eu", Default, true); 290 291 // Make sure requested minimum is less than requested maximum. 292 if (Requested.second && Requested.first > Requested.second) 293 return Default; 294 295 // Make sure requested values do not violate subtarget's specifications. 296 if (Requested.first < getMinWavesPerEU() || 297 Requested.first > getMaxWavesPerEU()) 298 return Default; 299 if (Requested.second > getMaxWavesPerEU()) 300 return Default; 301 302 // Make sure requested values are compatible with values implied by requested 303 // minimum/maximum flat work group sizes. 304 if (RequestedFlatWorkGroupSize && 305 Requested.first < MinImpliedByFlatWorkGroupSize) 306 return Default; 307 308 return Requested; 309 } 310 311 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { 312 Function *Kernel = I->getParent()->getParent(); 313 unsigned MinSize = 0; 314 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second; 315 bool IdQuery = false; 316 317 // If reqd_work_group_size is present it narrows value down. 318 if (auto *CI = dyn_cast<CallInst>(I)) { 319 const Function *F = CI->getCalledFunction(); 320 if (F) { 321 unsigned Dim = UINT_MAX; 322 switch (F->getIntrinsicID()) { 323 case Intrinsic::amdgcn_workitem_id_x: 324 case Intrinsic::r600_read_tidig_x: 325 IdQuery = true; 326 LLVM_FALLTHROUGH; 327 case Intrinsic::r600_read_local_size_x: 328 Dim = 0; 329 break; 330 case Intrinsic::amdgcn_workitem_id_y: 331 case Intrinsic::r600_read_tidig_y: 332 IdQuery = true; 333 LLVM_FALLTHROUGH; 334 case Intrinsic::r600_read_local_size_y: 335 Dim = 1; 336 break; 337 case Intrinsic::amdgcn_workitem_id_z: 338 case Intrinsic::r600_read_tidig_z: 339 IdQuery = true; 340 LLVM_FALLTHROUGH; 341 case Intrinsic::r600_read_local_size_z: 342 Dim = 2; 343 break; 344 default: 345 break; 346 } 347 if (Dim <= 3) { 348 if (auto Node = Kernel->getMetadata("reqd_work_group_size")) 349 if (Node->getNumOperands() == 3) 350 MinSize = MaxSize = mdconst::extract<ConstantInt>( 351 Node->getOperand(Dim))->getZExtValue(); 352 } 353 } 354 } 355 356 if (!MaxSize) 357 return false; 358 359 // Range metadata is [Lo, Hi). For ID query we need to pass max size 360 // as Hi. For size query we need to pass Hi + 1. 361 if (IdQuery) 362 MinSize = 0; 363 else 364 ++MaxSize; 365 366 MDBuilder MDB(I->getContext()); 367 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize), 368 APInt(32, MaxSize)); 369 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); 370 return true; 371 } 372 373 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, 374 const TargetMachine &TM) : 375 AMDGPUSubtarget(TT, GPU, FS, TM), 376 InstrInfo(*this), 377 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 378 TLInfo(TM, *this) {} 379 380 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, 381 const GCNTargetMachine &TM) 382 : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this), 383 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 384 TLInfo(TM, *this) { 385 CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); 386 Legalizer.reset(new AMDGPULegalizerInfo(*this, TM)); 387 388 RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); 389 InstSelector.reset(new AMDGPUInstructionSelector( 390 *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM)); 391 } 392 393 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, 394 unsigned NumRegionInstrs) const { 395 // Track register pressure so the scheduler can try to decrease 396 // pressure once register usage is above the threshold defined by 397 // SIRegisterInfo::getRegPressureSetLimit() 398 Policy.ShouldTrackPressure = true; 399 400 // Enabling both top down and bottom up scheduling seems to give us less 401 // register spills than just using one of these approaches on its own. 402 Policy.OnlyTopDown = false; 403 Policy.OnlyBottomUp = false; 404 405 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. 406 if (!enableSIScheduler()) 407 Policy.ShouldTrackLaneMasks = true; 408 } 409 410 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { 411 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); 412 } 413 414 unsigned SISubtarget::getKernArgSegmentSize(const Function &F, 415 unsigned ExplicitArgBytes) const { 416 uint64_t TotalSize = ExplicitArgBytes; 417 unsigned ImplicitBytes = getImplicitArgNumBytes(F); 418 419 if (ImplicitBytes != 0) { 420 unsigned Alignment = getAlignmentForImplicitArgPtr(); 421 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; 422 } 423 424 // Being able to dereference past the end is useful for emitting scalar loads. 425 return alignTo(TotalSize, 4); 426 } 427 428 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { 429 if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 430 if (SGPRs <= 80) 431 return 10; 432 if (SGPRs <= 88) 433 return 9; 434 if (SGPRs <= 100) 435 return 8; 436 return 7; 437 } 438 if (SGPRs <= 48) 439 return 10; 440 if (SGPRs <= 56) 441 return 9; 442 if (SGPRs <= 64) 443 return 8; 444 if (SGPRs <= 72) 445 return 7; 446 if (SGPRs <= 80) 447 return 6; 448 return 5; 449 } 450 451 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { 452 if (VGPRs <= 24) 453 return 10; 454 if (VGPRs <= 28) 455 return 9; 456 if (VGPRs <= 32) 457 return 8; 458 if (VGPRs <= 36) 459 return 7; 460 if (VGPRs <= 40) 461 return 6; 462 if (VGPRs <= 48) 463 return 5; 464 if (VGPRs <= 64) 465 return 4; 466 if (VGPRs <= 84) 467 return 3; 468 if (VGPRs <= 128) 469 return 2; 470 return 1; 471 } 472 473 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { 474 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 475 if (MFI.hasFlatScratchInit()) { 476 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 477 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). 478 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) 479 return 4; // FLAT_SCRATCH, VCC (in that order). 480 } 481 482 if (isXNACKEnabled()) 483 return 4; // XNACK, VCC (in that order). 484 return 2; // VCC. 485 } 486 487 unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { 488 const Function &F = MF.getFunction(); 489 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 490 491 // Compute maximum number of SGPRs function can use using default/requested 492 // minimum number of waves per execution unit. 493 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); 494 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false); 495 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true); 496 497 // Check if maximum number of SGPRs was explicitly requested using 498 // "amdgpu-num-sgpr" attribute. 499 if (F.hasFnAttribute("amdgpu-num-sgpr")) { 500 unsigned Requested = AMDGPU::getIntegerAttribute( 501 F, "amdgpu-num-sgpr", MaxNumSGPRs); 502 503 // Make sure requested value does not violate subtarget's specifications. 504 if (Requested && (Requested <= getReservedNumSGPRs(MF))) 505 Requested = 0; 506 507 // If more SGPRs are required to support the input user/system SGPRs, 508 // increase to accommodate them. 509 // 510 // FIXME: This really ends up using the requested number of SGPRs + number 511 // of reserved special registers in total. Theoretically you could re-use 512 // the last input registers for these special registers, but this would 513 // require a lot of complexity to deal with the weird aliasing. 514 unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs(); 515 if (Requested && Requested < InputNumSGPRs) 516 Requested = InputNumSGPRs; 517 518 // Make sure requested value is compatible with values implied by 519 // default/requested minimum/maximum number of waves per execution unit. 520 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false)) 521 Requested = 0; 522 if (WavesPerEU.second && 523 Requested && Requested < getMinNumSGPRs(WavesPerEU.second)) 524 Requested = 0; 525 526 if (Requested) 527 MaxNumSGPRs = Requested; 528 } 529 530 if (hasSGPRInitBug()) 531 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; 532 533 return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF), 534 MaxAddressableNumSGPRs); 535 } 536 537 unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { 538 const Function &F = MF.getFunction(); 539 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 540 541 // Compute maximum number of VGPRs function can use using default/requested 542 // minimum number of waves per execution unit. 543 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); 544 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); 545 546 // Check if maximum number of VGPRs was explicitly requested using 547 // "amdgpu-num-vgpr" attribute. 548 if (F.hasFnAttribute("amdgpu-num-vgpr")) { 549 unsigned Requested = AMDGPU::getIntegerAttribute( 550 F, "amdgpu-num-vgpr", MaxNumVGPRs); 551 552 // Make sure requested value is compatible with values implied by 553 // default/requested minimum/maximum number of waves per execution unit. 554 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) 555 Requested = 0; 556 if (WavesPerEU.second && 557 Requested && Requested < getMinNumVGPRs(WavesPerEU.second)) 558 Requested = 0; 559 560 if (Requested) 561 MaxNumVGPRs = Requested; 562 } 563 564 return MaxNumVGPRs; 565 } 566 567 namespace { 568 struct MemOpClusterMutation : ScheduleDAGMutation { 569 const SIInstrInfo *TII; 570 571 MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {} 572 573 void apply(ScheduleDAGInstrs *DAGInstrs) override { 574 ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); 575 576 SUnit *SUa = nullptr; 577 // Search for two consequent memory operations and link them 578 // to prevent scheduler from moving them apart. 579 // In DAG pre-process SUnits are in the original order of 580 // the instructions before scheduling. 581 for (SUnit &SU : DAG->SUnits) { 582 MachineInstr &MI2 = *SU.getInstr(); 583 if (!MI2.mayLoad() && !MI2.mayStore()) { 584 SUa = nullptr; 585 continue; 586 } 587 if (!SUa) { 588 SUa = &SU; 589 continue; 590 } 591 592 MachineInstr &MI1 = *SUa->getInstr(); 593 if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) || 594 (TII->isFLAT(MI1) && TII->isFLAT(MI2)) || 595 (TII->isSMRD(MI1) && TII->isSMRD(MI2)) || 596 (TII->isDS(MI1) && TII->isDS(MI2))) { 597 SU.addPredBarrier(SUa); 598 599 for (const SDep &SI : SU.Preds) { 600 if (SI.getSUnit() != SUa) 601 SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial)); 602 } 603 604 if (&SU != &DAG->ExitSU) { 605 for (const SDep &SI : SUa->Succs) { 606 if (SI.getSUnit() != &SU) 607 SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial)); 608 } 609 } 610 } 611 612 SUa = &SU; 613 } 614 } 615 }; 616 } // namespace 617 618 void SISubtarget::getPostRAMutations( 619 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { 620 Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo)); 621 } 622