//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +flat-for-global or -flat-for-global is specified, turn on
  // FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ?
          SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(0),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    FMA(false),
    MIMG_R128(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ?
                               Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
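  // E.g. with a reqd_work_group_size of 64 in this dimension, an ID query
  // gets the range [0, 64) while a size query gets [64, 65), i.e. exactly 64.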
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const GCNTargetMachine &TM)
  : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
    TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));

  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
    *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
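    // When a matching pair is found below, SU is tied to immediately follow
    // SUa: SU gets a barrier dependence on SUa, SUa inherits SU's other
    // predecessors as artificial edges, and SUa's other successors are made
    // to depend on SU.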
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void SISubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}