//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "R600MachineScheduler.h"
#include "AMDGPUSubtarget.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
  DAG = static_cast<ScheduleDAGMILive *>(dag);
  const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>();
  TII = static_cast<const R600InstrInfo *>(DAG->TII);
  TRI = static_cast<const R600RegisterInfo *>(DAG->TRI);
  VLIW5 = !ST.hasCaymanISA();
  MRI = &DAG->MRI;
  CurInstKind = IDOther;
  CurEmitted = 0;
  OccupedSlotsMask = 31;
  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
  InstKindLimit[IDOther] = 32;
  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
  AluInstCount = 0;
  FetchInstCount = 0;
}

void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
                                  std::vector<SUnit *> &QDst) {
  llvm::append_range(QDst, QSrc);
  QSrc.clear();
}

// Approximate number of wavefronts that can be in flight when each wavefront
// needs GPRCount registers; this heuristic assumes 248 usable GPRs.
static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
  assert(GPRCount && "GPRCount cannot be 0");
  return 248 / GPRCount;
}

SUnit *R600SchedStrategy::pickNode(bool &IsTopNode) {
  SUnit *SU = nullptr;
  NextInstKind = IDOther;

  IsTopNode = false;

  // Check if we might want to switch the current clause type.
  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
                          (Available[CurInstKind].empty());
  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
                            (!Available[IDFetch].empty() ||
                             !Available[IDOther].empty());

  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
    // We use the heuristic provided by the AMD Accelerated Parallel Processing
    // OpenCL Programming Guide: the approximate number of wavefronts that
    // allows TEX instructions to hide ALU instructions is
    //   500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU)).
    // The cast keeps the division in floating point, as the ratio intends.
    float ALUFetchRatioEstimate =
        static_cast<float>(AluInstCount + AvailablesAluCount() +
                           Pending[IDAlu].size()) /
        (FetchInstCount + Available[IDFetch].size());
    if (ALUFetchRatioEstimate == 0) {
      AllowSwitchFromAlu = true;
    } else {
      unsigned NeededWF = 62.5f / ALUFetchRatioEstimate;
      LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
      // We assume the local GPR requirements to be "dominated" by the
      // requirements of the TEX clause (which consumes 128-bit regs); ALU
      // instructions before and after TEX are indeed likely to consume or
      // generate values from/for the TEX clause.
      // Available[IDFetch].size() * 2: GPRs required in the Fetch clause.
      // We assume that fetch instructions are either TnXYZW = TEX TnXYZW
      // (needing one GPR) or TmXYZW = TnXYZW (needing two GPRs).
      // (TODO: use RegisterPressure.)
      // If we are going to use too many GPRs, we flush the Fetch instructions
      // to lower the register pressure on 128-bit regs.
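      // For illustration (figures invented for this comment, not from the
      // guide): if the ALU total (emitted + available + pending) is 25
      // against a fetch total of 5, the ratio estimate is 5 and
      // NeededWF = 62.5 / 5 = 12 wavefronts. Five available fetches need
      // about 2 * 5 = 10 GPRs, and getWFCountLimitedByGPR(10) = 248 / 10 =
      // 24 wavefronts, so the GPR budget still covers the latency-hiding
      // requirement and we keep emitting ALU instructions.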
      unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
      if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
        AllowSwitchFromAlu = true;
    }
  }

  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
              (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
    // Try to pick an ALU instruction.
    SU = pickAlu();
    if (!SU && !PhysicalRegCopy.empty()) {
      SU = PhysicalRegCopy.front();
      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
    }
    if (SU) {
      if (CurEmitted >= InstKindLimit[IDAlu])
        CurEmitted = 0;
      NextInstKind = IDAlu;
    }
  }

  if (!SU) {
    // Try to pick a FETCH instruction.
    SU = pickOther(IDFetch);
    if (SU)
      NextInstKind = IDFetch;
  }

  // Try to pick some other instruction.
  if (!SU) {
    SU = pickOther(IDOther);
    if (SU)
      NextInstKind = IDOther;
  }

  LLVM_DEBUG(if (SU) {
    dbgs() << " ** Pick node **\n";
    DAG->dumpNode(*SU);
  } else {
    dbgs() << "NO NODE\n";
    for (const SUnit &S : DAG->SUnits)
      if (!S.isScheduled)
        DAG->dumpNode(S);
  });

  return SU;
}

void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  if (NextInstKind != CurInstKind) {
    LLVM_DEBUG(dbgs() << "Instruction Type Switch\n");
    if (NextInstKind != IDAlu)
      OccupedSlotsMask |= 31;
    CurEmitted = 0;
    CurInstKind = NextInstKind;
  }

  if (CurInstKind == IDAlu) {
    AluInstCount++;
    switch (getAluKind(SU)) {
    case AluT_XYZW:
      CurEmitted += 4;
      break;
    case AluDiscarded:
      break;
    default: {
      ++CurEmitted;
      // Each literal operand occupies an extra slot in the clause.
      for (MachineOperand &MO : SU->getInstr()->operands())
        if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
          ++CurEmitted;
    }
    }
  } else {
    ++CurEmitted;
  }

  LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");

  if (CurInstKind != IDFetch)
    MoveUnits(Pending[IDFetch], Available[IDFetch]);
  else
    FetchInstCount++;
}

static bool isPhysicalRegCopy(MachineInstr *MI) {
  if (MI->getOpcode() != R600::COPY)
    return false;

  return !MI->getOperand(1).getReg().isVirtual();
}

void R600SchedStrategy::releaseTopNode(SUnit *SU) {
  LLVM_DEBUG(dbgs() << "Top Releasing "; DAG->dumpNode(*SU));
}

void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
  LLVM_DEBUG(dbgs() << "Bottom Releasing "; DAG->dumpNode(*SU));
  if (isPhysicalRegCopy(SU->getInstr())) {
    PhysicalRegCopy.push_back(SU);
    return;
  }

  int IK = getInstKind(SU);

  // There is no export clause; we can schedule one as soon as it's ready.
  if (IK == IDOther)
    Available[IDOther].push_back(SU);
  else
    Pending[IK].push_back(SU);
}

bool R600SchedStrategy::regBelongsToClass(Register Reg,
                                          const TargetRegisterClass *RC) const {
  if (!Reg.isVirtual())
    return RC->contains(Reg);
  return MRI->getRegClass(Reg) == RC;
}
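
// getAluKind buckets an ALU instruction by the VLIW slot(s) it can occupy.
// The R600 family issues instruction groups of up to five slots: four vector
// slots (X, Y, Z, W) plus, when VLIW5 is true (non-Cayman targets), a fifth
// transcendental (Trans) slot. AluT_XYZW marks instructions that need the
// whole vector part; AluAny marks those that fit in any vector slot.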
R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();

  if (TII->isTransOnly(*MI))
    return AluTrans;

  switch (MI->getOpcode()) {
  case R600::PRED_X:
    return AluPredX;
  case R600::INTERP_PAIR_XY:
  case R600::INTERP_PAIR_ZW:
  case R600::INTERP_VEC_LOAD:
  case R600::DOT_4:
    return AluT_XYZW;
  case R600::COPY:
    if (MI->getOperand(1).isUndef()) {
      // MI will become a KILL; don't consider it in scheduling.
      return AluDiscarded;
    }
    break;
  default:
    break;
  }

  // Does the instruction take a whole instruction group?
  // XXX: Is it possible to add a helper function in R600InstrInfo that can
  // be used here and in R600PacketizerList::isSoloInstruction()?
  if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode()) ||
      TII->isReductionOp(MI->getOpcode()) ||
      MI->getOpcode() == R600::GROUP_BARRIER)
    return AluT_XYZW;

  if (TII->isLDSInstr(MI->getOpcode()))
    return AluT_X;

  // Is the result already assigned to a channel?
  unsigned DestSubReg = MI->getOperand(0).getSubReg();
  switch (DestSubReg) {
  case R600::sub0:
    return AluT_X;
  case R600::sub1:
    return AluT_Y;
  case R600::sub2:
    return AluT_Z;
  case R600::sub3:
    return AluT_W;
  default:
    break;
  }

  // Is the result already a member of an X/Y/Z/W class?
  Register DestReg = MI->getOperand(0).getReg();
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) ||
      regBelongsToClass(DestReg, &R600::R600_AddrRegClass))
    return AluT_X;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass))
    return AluT_Y;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass))
    return AluT_Z;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass))
    return AluT_W;
  if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass))
    return AluT_XYZW;

  // LDS src registers cannot be used in the Trans slot.
  if (TII->readsLDSSrcReg(*MI))
    return AluT_XYZW;

  return AluAny;
}

int R600SchedStrategy::getInstKind(SUnit *SU) {
  int Opcode = SU->getInstr()->getOpcode();

  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
    return IDFetch;

  if (TII->isALUInstr(Opcode))
    return IDAlu;

  switch (Opcode) {
  case R600::PRED_X:
  case R600::COPY:
  case R600::CONST_COPY:
  case R600::INTERP_PAIR_XY:
  case R600::INTERP_PAIR_ZW:
  case R600::INTERP_VEC_LOAD:
  case R600::DOT_4:
    return IDAlu;
  default:
    return IDOther;
  }
}

SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
  if (Q.empty())
    return nullptr;
  // Scan the queue from the back and pop the first candidate that still fits
  // the constant-read limitations of the instruction group being built.
  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
       It != E; ++It) {
    SUnit *SU = *It;
    InstructionsGroupCandidate.push_back(SU->getInstr());
    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
        (!AnyALU || !TII->isVectorOnly(*SU->getInstr()))) {
      InstructionsGroupCandidate.pop_back();
      Q.erase((It + 1).base());
      return SU;
    }
    InstructionsGroupCandidate.pop_back();
  }
  return nullptr;
}

void R600SchedStrategy::LoadAlu() {
  std::vector<SUnit *> &QSrc = Pending[IDAlu];
  for (SUnit *SU : QSrc)
    AvailableAlus[getAluKind(SU)].push_back(SU);
  QSrc.clear();
}

void R600SchedStrategy::PrepareNextSlot() {
  LLVM_DEBUG(dbgs() << "New Slot\n");
  assert(OccupedSlotsMask && "Slot wasn't filled");
  OccupedSlotsMask = 0;
  // if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
  //   OccupedSlotsMask |= 16;
  InstructionsGroupCandidate.clear();
  LoadAlu();
}

void R600SchedStrategy::AssignSlot(MachineInstr *MI, unsigned Slot) {
  int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst);
  if (DstIndex == -1)
    return;
  Register DestReg = MI->getOperand(DstIndex).getReg();
  // PressureRegister crashes if an operand is defined and used in the same
  // instruction and we try to constrain its regclass.
  for (MachineOperand &MO : MI->operands())
    if (MO.isReg() && !MO.isDef() && MO.getReg() == DestReg)
      return;
  // Constrain the regclass of DestReg so that it is assigned to Slot.
  switch (Slot) {
  case 0:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass);
    break;
  case 1:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass);
    break;
  case 2:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass);
    break;
  case 3:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass);
    break;
  }
}

SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
  if (SlotedSU)
    return SlotedSU;
  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
  if (UnslotedSU)
    AssignSlot(UnslotedSU->getInstr(), Slot);
  return UnslotedSU;
}

unsigned R600SchedStrategy::AvailablesAluCount() const {
  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
         AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
         AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
         AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
         AvailableAlus[AluPredX].size();
}
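
// pickAlu fills the current instruction group bottom-up across successive
// calls. OccupedSlotsMask tracks the group occupancy: bits 0-3 stand for the
// X, Y, Z and W vector slots and bit 4 for the Trans slot, so |= 15 reserves
// the vector part and |= 31 the whole group. A sketch of one group on a
// VLIW5 target: a Trans-only instruction claims bit 4 first, channels W down
// to X are then filled through AttemptFillSlot, and once no candidate fits
// any free slot, PrepareNextSlot() clears the mask and starts a fresh group.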
SUnit *R600SchedStrategy::pickAlu() {
  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
    if (!OccupedSlotsMask) {
      // Bottom-up scheduling: PRED_X must come first.
      if (!AvailableAlus[AluPredX].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluPredX], false);
      }
      // Flush discarded copies (they become KILLs and RA will drop them).
      if (!AvailableAlus[AluDiscarded].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluDiscarded], false);
      }
      // If there is a T_XYZW alu available, use it.
      if (!AvailableAlus[AluT_XYZW].empty()) {
        OccupedSlotsMask |= 15;
        return PopInst(AvailableAlus[AluT_XYZW], false);
      }
    }
    bool TransSlotOccuped = OccupedSlotsMask & 16;
    if (!TransSlotOccuped && VLIW5) {
      if (!AvailableAlus[AluTrans].empty()) {
        OccupedSlotsMask |= 16;
        return PopInst(AvailableAlus[AluTrans], false);
      }
      SUnit *SU = AttemptFillSlot(3, true);
      if (SU) {
        OccupedSlotsMask |= 16;
        return SU;
      }
    }
    for (int Chan = 3; Chan > -1; --Chan) {
      bool isOccupied = OccupedSlotsMask & (1 << Chan);
      if (!isOccupied) {
        SUnit *SU = AttemptFillSlot(Chan, false);
        if (SU) {
          OccupedSlotsMask |= (1 << Chan);
          InstructionsGroupCandidate.push_back(SU->getInstr());
          return SU;
        }
      }
    }
    PrepareNextSlot();
  }
  return nullptr;
}

SUnit *R600SchedStrategy::pickOther(int QID) {
  SUnit *SU = nullptr;
  std::vector<SUnit *> &AQ = Available[QID];

  if (AQ.empty())
    MoveUnits(Pending[QID], AQ);
  if (!AQ.empty()) {
    SU = AQ.back();
    AQ.pop_back();
  }
  return SU;
}