//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +/-flat-for-global is specified explicitly, turn on FlatForGlobal
  // for all OSes on VI and newer hardware to avoid assertion failures due to
  // missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(0),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    FMA(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                          const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}
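
// Worked example of the arithmetic in getOccupancyWithLocalMemSize above
// (illustrative numbers only, not tied to a particular subtarget): with
// 65536 bytes of local memory, MaxWaves = 10 and a single work group per CU,
// a kernel using 16384 bytes of LDS yields
//   Limit / Bytes = 655360 / 16384 = 40 waves,
// which the std::min/std::max clamp then limits to the [1, MaxWaves] range,
// i.e. an occupancy of 10.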
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process the "amdgpu-max-work-group-size" attribute once mesa
  // starts using the "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default minimum/
  // maximum number of waves per execution unit to the values implied by the
  // requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process the "amdgpu-max-work-group-size" attribute once mesa
  // starts using the "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
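
// For illustration only (hypothetical attribute values, not taken from this
// file): a kernel can request these limits through IR function attributes,
// which the queries above read via AMDGPU::getIntegerPairAttribute, e.g.
//
//   define amdgpu_kernel void @example() #0 { ... }
//   attributes #0 = { "amdgpu-flat-work-group-size"="128,256"
//                     "amdgpu-waves-per-eu"="2,4" }
//
// Out-of-range or inconsistent values simply fall back to the defaults.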
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
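
// A sketch of the result of makeLIDRangeMetadata above (illustrative values
// only): for a workitem ID query with a maximum flat work group size of 256,
// the range metadata attached to the call is equivalent to
//
//   %id = call i32 @llvm.amdgcn.workitem.id.x(), !range !0
//   !0 = !{i32 0, i32 256}
//
// i.e. the half-open interval [0, 256).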
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM)
  : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
    TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo());

  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
    *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In the DAG pre-process, SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void SISubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}