//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer -------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// The AMDGPUAsmPrinter is used to print both assembly strings and binary
/// code. When passed an MCAsmStreamer it prints assembly and when passed
/// an MCObjectStreamer it outputs binary code.
//
//===----------------------------------------------------------------------===//
//

#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"

using namespace llvm;

// TODO: This should get the default rounding mode from the kernel. We just set
// the default here, but this could change if the OpenCL rounding mode pragmas
// are used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
//
// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
// precision, leaves single precision to flush all, and does not report
// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
// CL_FP_DENORM for both.
//
// FIXME: It seems some instructions do not support single precision denormals
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, sqrt_f32, and
// sin_f32, cos_f32 on most parts).

// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device, so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(const MachineFunction &F) {
  const SISubtarget &ST = F.getSubtarget<SISubtarget>();
  // TODO: Is there any real use for the flush in only / flush out only modes?

  uint32_t FP32Denormals =
      ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  uint32_t FP64Denormals =
      ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_DENORM_MODE_SP(FP32Denormals) |
         FP_DENORM_MODE_DP(FP64Denormals);
}
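
// As an example of the packing above: on a subtarget with fp64 denormals
// enabled but fp32 denormals disabled, the returned mode selects
// round-to-nearest for both precisions, flush-in/flush-out for fp32
// denormals, and flush-none for fp64 denormals.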

static AsmPrinter *
createAMDGPUAsmPrinterPass(TargetMachine &tm,
                           std::unique_ptr<MCStreamer> &&Streamer) {
  return new AMDGPUAsmPrinter(tm, std::move(Streamer));
}

extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
  TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
                                     createAMDGPUAsmPrinterPass);
  TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
                                     createAMDGPUAsmPrinterPass);
}

AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
  : AsmPrinter(TM, std::move(Streamer)) {
  AMDGPUASI = static_cast<AMDGPUTargetMachine*>(&TM)->getAMDGPUAS();
}

StringRef AMDGPUAsmPrinter::getPassName() const {
  return "AMDGPU Assembly Printer";
}

const MCSubtargetInfo *AMDGPUAsmPrinter::getSTI() const {
  return TM.getMCSubtargetInfo();
}

AMDGPUTargetStreamer &AMDGPUAsmPrinter::getTargetStreamer() const {
  return static_cast<AMDGPUTargetStreamer &>(*OutStreamer->getTargetStreamer());
}

void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
  AMDGPU::IsaInfo::IsaVersion ISA =
      AMDGPU::IsaInfo::getIsaVersion(getSTI()->getFeatureBits());

  if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
    readPalMetadata(M);
    // AMDPAL wants an HSA_ISA .note.
    getTargetStreamer().EmitDirectiveHSACodeObjectISA(
        ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU");
  }
  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  getTargetStreamer().EmitDirectiveHSACodeObjectVersion(2, 1);
  getTargetStreamer().EmitDirectiveHSACodeObjectISA(
      ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU");
  getTargetStreamer().EmitStartOfCodeObjectMetadata(M);
}

void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
  if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
    // Copy the PAL metadata from the map where we collected it into a vector,
    // then write it as a .note.
    std::vector<uint32_t> Data;
    for (auto i : PalMetadata) {
      Data.push_back(i.first);
      Data.push_back(i.second);
    }
    getTargetStreamer().EmitPalMetadata(Data);
  }

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  getTargetStreamer().EmitEndOfCodeObjectMetadata();
}

bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
    const MachineBasicBlock *MBB) const {
  if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
    return false;

  if (MBB->empty())
    return true;

  // If this is a block implementing a long branch, an expression relative to
  // the start of the block is needed.
  // XXX - Is there a smarter way to check this?
  return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
}
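
// For entry functions, emit the amd_kernel_code_t header (when targeting
// code object v2) and, on AMDHSA, the kernel's code object metadata, before
// the function body itself is printed.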
void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
  const AMDGPUMachineFunction *MFI = MF->getInfo<AMDGPUMachineFunction>();
  if (!MFI->isEntryFunction())
    return;

  const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
  amd_kernel_code_t KernelCode;
  if (STM.isAmdCodeObjectV2(*MF)) {
    getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);

    OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
    getTargetStreamer().EmitAMDKernelCodeT(KernelCode);
  }

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;
  getTargetStreamer().EmitKernelCodeObjectMetadata(*MF->getFunction(),
                                                   KernelCode);
}

void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
  if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) {
    SmallString<128> SymbolName;
    getNameWithPrefix(SymbolName, MF->getFunction());
    getTargetStreamer().EmitAMDGPUSymbolType(
        SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
  }

  AsmPrinter::EmitFunctionEntryLabel();
}

void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
  // Group segment variables aren't emitted in HSA.
  if (AMDGPU::isGroupSegment(GV, AMDGPUASI))
    return;

  AsmPrinter::EmitGlobalVariable(GV);
}

bool AMDGPUAsmPrinter::doFinalization(Module &M) {
  CallGraphResourceInfo.clear();
  return AsmPrinter::doFinalization(M);
}

// For the amdpal OS type, read the amdgpu.pal.metadata supplied by the
// frontend into our PalMetadata map, ready for per-function modification. It
// is a NamedMD containing an MDTuple containing a number of MDNodes, each of
// which is an integer value; each pair of integers forms a key=value pair
// that we store as PalMetadata[key] = value in the map.
void AMDGPUAsmPrinter::readPalMetadata(Module &M) {
  auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
  if (!NamedMD || !NamedMD->getNumOperands())
    return;
  auto Tuple = dyn_cast<MDTuple>(NamedMD->getOperand(0));
  if (!Tuple)
    return;
  for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) {
    auto Key = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I));
    auto Val = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I + 1));
    if (!Key || !Val)
      continue;
    PalMetadata[Key->getZExtValue()] = Val->getZExtValue();
  }
}
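
// For example, the frontend might supply (with hypothetical key/value
// numbers):
//
//   !amdgpu.pal.metadata = !{!0}
//   !0 = !{i32 10, i32 4096, i32 11, i32 1}
//
// which readPalMetadata records as PalMetadata[10] = 4096 and
// PalMetadata[11] = 1.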

// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
    uint32_t NumVGPR,
    uint32_t NumSGPR,
    uint32_t ScratchSize,
    uint64_t CodeSize) {
  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
  OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
  OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
  OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
}

bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  CurrentProgramInfo = SIProgramInfo();

  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  // The starting address of all shader programs must be 256 bytes aligned.
  // Regular functions just need the basic required instruction alignment.
  MF.setAlignment(MFI->isEntryFunction() ? 8 : 2);

  SetupMachineFunction(MF);

  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
  MCContext &Context = getObjFileLowering().getContext();
  if (!STM.isAmdHsaOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(ConfigSection);
  }

  if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    if (MFI->isEntryFunction()) {
      getSIProgramInfo(CurrentProgramInfo, MF);
    } else {
      auto I = CallGraphResourceInfo.insert(
          std::make_pair(MF.getFunction(), SIFunctionResourceInfo()));
      SIFunctionResourceInfo &Info = I.first->second;
      assert(I.second && "should only be called once per function");
      Info = analyzeResourceUsage(MF);
    }

    if (STM.isAmdPalOS())
      EmitPalMetadata(MF, CurrentProgramInfo);
    if (!STM.isAmdHsaOS()) {
      EmitProgramInfoSI(MF, CurrentProgramInfo);
    }
  } else {
    EmitProgramInfoR600(MF);
  }

  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  EmitFunctionBody();

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(CommentSection);

    if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      if (!MFI->isEntryFunction()) {
        OutStreamer->emitRawComment(" Function info:", false);
        SIFunctionResourceInfo &Info = CallGraphResourceInfo[MF.getFunction()];
        emitCommonFunctionComments(
            Info.NumVGPR,
            Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()),
            Info.PrivateSegmentSize,
            getFunctionCodeSize(MF));
        return false;
      }

      OutStreamer->emitRawComment(" Kernel info:", false);
      emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
                                 CurrentProgramInfo.NumSGPR,
                                 CurrentProgramInfo.ScratchSize,
                                 getFunctionCodeSize(MF));

      OutStreamer->emitRawComment(
          " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
      OutStreamer->emitRawComment(
          " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
      OutStreamer->emitRawComment(
          " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
          " bytes/workgroup (compile time only)", false);

      OutStreamer->emitRawComment(
          " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
      OutStreamer->emitRawComment(
          " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);

      OutStreamer->emitRawComment(
          " NumSGPRsForWavesPerEU: " +
          Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
      OutStreamer->emitRawComment(
          " NumVGPRsForWavesPerEU: " +
          Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);

      OutStreamer->emitRawComment(
          " ReservedVGPRFirst: " + Twine(CurrentProgramInfo.ReservedVGPRFirst),
          false);
      OutStreamer->emitRawComment(
          " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount),
          false);

      if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
        OutStreamer->emitRawComment(
            " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
            Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR),
            false);
        OutStreamer->emitRawComment(
            " DebuggerPrivateSegmentBufferSGPR: s" +
            Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
      }

      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC2:USER_SGPR: " +
          Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
          Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
          Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
          Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
          Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
          Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
          false);
    } else {
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      OutStreamer->emitRawComment(
          Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize)));
    }
  }

  if (STM.dumpCode()) {
    OutStreamer->SwitchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));

    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
      Comment += " ; " + HexLines[i] + "\n";

      OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
      OutStreamer->EmitBytes(StringRef(Comment));
    }
  }

  return false;
}
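
// Emit the R600-family program resource values (GPR count, stack size,
// whether the shader kills pixels, and the LDS allocation for compute) as
// register/value pairs in the .AMDGPU.config section.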
void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
  unsigned MaxGPR = 0;
  bool killPixel = false;
  const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>();
  const R600RegisterInfo *RI = STM.getRegisterInfo();
  const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      if (MI.getOpcode() == AMDGPU::KILLGT)
        killPixel = true;
      unsigned numOperands = MI.getNumOperands();
      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
        const MachineOperand &MO = MI.getOperand(op_idx);
        if (!MO.isReg())
          continue;
        unsigned HWReg = RI->getHWRegIndex(MO.getReg());

        // Registers with a value > 127 aren't GPRs.
        if (HWReg > 127)
          continue;
        MaxGPR = std::max(MaxGPR, HWReg);
      }
    }
  }

  unsigned RsrcReg;
  if (STM.getGeneration() >= R600Subtarget::EVERGREEN) {
    // Evergreen / Northern Islands
    switch (MF.getFunction()->getCallingConv()) {
    default: LLVM_FALLTHROUGH;
    case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
    case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
    case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
    case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
    }
  } else {
    // R600 / R700
    switch (MF.getFunction()->getCallingConv()) {
    default: LLVM_FALLTHROUGH;
    case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH;
    case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH;
    case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
    case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
    }
  }

  OutStreamer->EmitIntValue(RsrcReg, 4);
  OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
                            S_STACK_SIZE(MFI->CFStackSize), 4);
  OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
  OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);

  if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
    OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
    OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
  }
}

uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = STM.getInstrInfo();

  uint64_t CodeSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: CodeSize should account for multiple functions.

      // TODO: Should we count size of debug info?
      if (MI.isDebugValue())
        continue;

      CodeSize += TII->getInstSizeInBytes(MI);
    }
  }

  return CodeSize;
}

static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII,
                                  unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}
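
// Count the SGPRs implicitly reserved beyond those explicitly used: VCC
// takes 2; when flat scratch is used the reservation grows to 4 in total
// before VI and to 6 on VI and later, where enabling XNACK alone also
// reserves 4.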
static unsigned getNumExtraSGPRs(const SISubtarget &ST,
                                 bool VCCUsed,
                                 bool FlatScrUsed) {
  unsigned ExtraSGPRs = 0;
  if (VCCUsed)
    ExtraSGPRs = 2;

  if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
    if (FlatScrUsed)
      ExtraSGPRs = 4;
  } else {
    if (ST.isXNACKEnabled())
      ExtraSGPRs = 4;

    if (FlatScrUsed)
      ExtraSGPRs = 6;
  }

  return ExtraSGPRs;
}

int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
    const SISubtarget &ST) const {
  return NumExplicitSGPR + getNumExtraSGPRs(ST, UsesVCC, UsesFlatScratch);
}

AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
    const MachineFunction &MF) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly
  // may need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
                 MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
        TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
        TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

  int32_t MaxVGPR = -1;
  int32_t MaxSGPR = -1;
  uint32_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;

        if (!MO.isReg())
          continue;

        unsigned Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugValue());
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        default:
          break;
        }
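
        // Classify the remaining registers by register class to decide
        // whether they count against the SGPR or VGPR budget and how many
        // consecutive 32-bit registers they occupy.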
        if (AMDGPU::SReg_32RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else {
          llvm_unreachable("Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp
          = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
        const Function *Callee = cast<Function>(CalleeOp->getGlobal());
        if (Callee->isDeclaration()) {
          // If this is a call to an external function, we can't do much. Make
          // conservative guesses.

          // 48 SGPRs - vcc - flat_scr - xnack
          int MaxSGPRGuess = 47 - getNumExtraSGPRs(ST, true,
                                                   ST.hasFlatAddressSpace());
          MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
          MaxVGPR = std::max(MaxVGPR, 23);

          CalleeFrameSize = std::max(CalleeFrameSize, 16384u);
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          auto I = CallGraphResourceInfo.find(Callee);
          assert(I != CallGraphResourceInfo.end() &&
                 "callee should have been handled before caller");

          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          CalleeFrameSize
            = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
        }

        if (!Callee->doesNotRecurse())
          Info.HasRecursion = true;
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}
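
// Fill ProgInfo for an entry function from its analyzed resource usage,
// apply the subtarget's register limits and wave-count constraints, and
// diagnose any resource overflows.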
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  SIFunctionResourceInfo Info = analyzeResourceUsage(MF);

  ProgInfo.NumVGPR = Info.NumVGPR;
  ProgInfo.NumSGPR = Info.NumExplicitSGPR;
  ProgInfo.ScratchSize = Info.PrivateSegmentSize;
  ProgInfo.VCCUsed = Info.UsesVCC;
  ProgInfo.FlatUsed = Info.UsesFlatScratch;
  ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;

  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = STM.getInstrInfo();
  const SIRegisterInfo *RI = &TII->getRegisterInfo();

  unsigned ExtraSGPRs = getNumExtraSGPRs(STM,
                                         ProgInfo.VCCUsed,
                                         ProgInfo.FlatUsed);
  unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF);

  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction()->getContext();
      DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
                                       "addressable scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
    }
  }

  // Account for extra SGPRs and VGPRs reserved for debugger use.
  ProgInfo.NumSGPR += ExtraSGPRs;
  ProgInfo.NumVGPR += ExtraVGPRs;

  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  ProgInfo.NumSGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
  ProgInfo.NumVGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));

  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction()->getContext();
      DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
                                       "scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
      ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
    }
  }

  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
    ProgInfo.NumSGPRsForWavesPerEU =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
  }

  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction()->getContext();
    DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction()->getContext();
    DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory",
                                     MFI->getLDSSize(), DS_Error);
    Ctx.diagnose(Diag);
  }
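
  // The hardware encodes register counts in coarse blocks. As an example,
  // with an encoding granule of 8 registers, a function using 10 SGPRs is
  // padded to 16, i.e. two blocks, and encoded below as SGPRBlocks = 1.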
  // SGPRBlocks is the actual number of SGPR blocks minus 1.
  ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU,
                                STM.getSGPREncodingGranule());
  ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / STM.getSGPREncodingGranule() - 1;

  // VGPRBlocks is the actual number of VGPR blocks minus 1.
  ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU,
                                STM.getVGPREncodingGranule());
  ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / STM.getVGPREncodingGranule() - 1;

  // Record first reserved VGPR and number of reserved VGPRs.
  ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? ProgInfo.NumVGPR : 0;
  ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF);

  // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
  // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
  // attribute was requested.
  if (STM.debuggerEmitPrologue()) {
    ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
        RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
    ProgInfo.DebuggerPrivateSegmentBufferSGPR =
        RI->getHWRegIndex(MFI->getScratchRSrcReg());
  }

  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(MF);

  ProgInfo.IEEEMode = STM.enableIEEEBit(MF);

  // Make the clamp modifier on NaN input return 0.
  ProgInfo.DX10Clamp = STM.enableDX10Clamp();

  unsigned LDSAlignShift;
  if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  } else {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  }

  unsigned LDSSpillSize =
    MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();

  ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // Scratch is allocated in 256 dword blocks.
  unsigned ScratchAlignShift = 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks =
      alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
              1ULL << ScratchAlignShift) >>
      ScratchAlignShift;

  ProgInfo.ComputePGMRSrc1 =
      S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
      S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
      S_00B848_PRIORITY(ProgInfo.Priority) |
      S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
      S_00B848_PRIV(ProgInfo.Priv) |
      S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
      S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
      S_00B848_IEEE_MODE(ProgInfo.IEEEMode);

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  ProgInfo.ComputePGMRSrc2 =
      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
      S_00B84C_TRAP_HANDLER(STM.isTrapHandlerEnabled()) |
      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
      S_00B84C_EXCP_EN_MSB(0) |
      // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
      S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
      S_00B84C_EXCP_EN(0);
}
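
// Select the RSRC1 register for the shader stage implied by the calling
// convention; unrecognized conventions are treated as compute.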
static unsigned getRsrcReg(CallingConv::ID CallConv) {
  switch (CallConv) {
  default: LLVM_FALLTHROUGH;
  case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
  case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
  case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
  case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
  case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
  case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
  case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
  }
}

void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &CurrentProgramInfo) {
  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());

  if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
    OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);

    OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);

    OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
    OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4);

    OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
    OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);

    // TODO: Should probably note flat usage somewhere. SC emits a
    // "FlatPtr32 = 0" comment but I don't see a corresponding field in the
    // register spec.
  } else {
    OutStreamer->EmitIntValue(RsrcReg, 4);
    OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
                              S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
    unsigned Rsrc2Val = 0;
    if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
      OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
      OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
      if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
        Rsrc2Val = S_00B84C_SCRATCH_EN(CurrentProgramInfo.ScratchBlocks > 0);
    }
    if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
      OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
      OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
      OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
      OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
      Rsrc2Val |= S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks);
    }
    if (Rsrc2Val) {
      OutStreamer->EmitIntValue(RsrcReg + 4 /*rsrc2*/, 4);
      OutStreamer->EmitIntValue(Rsrc2Val, 4);
    }
  }

  OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
  OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
  OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
  OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
}

// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PalMetadata map, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, PalMetadata is
// then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPalMetadata(const MachineFunction &MF,
                                       const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // Given the calling convention, calculate the register number for rsrc1. In
  // principle the register number could change in future hardware, but we know
  // it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so
  // we can use the same fixed value that .AMDGPU.config has for Mesa. Note
  // that we use a register number rather than a byte offset, so we need to
  // divide by 4.
  unsigned Rsrc1Reg = getRsrcReg(MF.getFunction()->getCallingConv()) / 4;
  unsigned Rsrc2Reg = Rsrc1Reg + 1;
  // Also calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used
  // with a constant offset to access any non-register shader-specific PAL
  // metadata key.
  unsigned ScratchSizeKey = AMDGPU::ElfNote::AMDGPU_PAL_METADATA_CS_SCRATCH_SIZE;
  switch (MF.getFunction()->getCallingConv()) {
  case CallingConv::AMDGPU_PS:
    ScratchSizeKey = AMDGPU::ElfNote::AMDGPU_PAL_METADATA_PS_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_VS:
    ScratchSizeKey = AMDGPU::ElfNote::AMDGPU_PAL_METADATA_VS_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_GS:
    ScratchSizeKey = AMDGPU::ElfNote::AMDGPU_PAL_METADATA_GS_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_ES:
    ScratchSizeKey = AMDGPU::ElfNote::AMDGPU_PAL_METADATA_ES_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_HS:
    ScratchSizeKey = AMDGPU::ElfNote::AMDGPU_PAL_METADATA_HS_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_LS:
    ScratchSizeKey = AMDGPU::ElfNote::AMDGPU_PAL_METADATA_LS_SCRATCH_SIZE;
    break;
  }
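  // The *_NUM_USED_VGPRS and *_NUM_USED_SGPRS keys sit at the same fixed
  // offset from the corresponding *_SCRATCH_SIZE key for every shader stage,
  // so they can be derived from ScratchSizeKey without repeating the switch.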
  unsigned NumUsedVgprsKey = ScratchSizeKey
      + AMDGPU::ElfNote::AMDGPU_PAL_METADATA_VS_NUM_USED_VGPRS
      - AMDGPU::ElfNote::AMDGPU_PAL_METADATA_VS_SCRATCH_SIZE;
  unsigned NumUsedSgprsKey = ScratchSizeKey
      + AMDGPU::ElfNote::AMDGPU_PAL_METADATA_VS_NUM_USED_SGPRS
      - AMDGPU::ElfNote::AMDGPU_PAL_METADATA_VS_SCRATCH_SIZE;
  PalMetadata[NumUsedVgprsKey] = CurrentProgramInfo.NumVGPRsForWavesPerEU;
  PalMetadata[NumUsedSgprsKey] = CurrentProgramInfo.NumSGPRsForWavesPerEU;
  if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
    PalMetadata[Rsrc1Reg] |= CurrentProgramInfo.ComputePGMRSrc1;
    PalMetadata[Rsrc2Reg] |= CurrentProgramInfo.ComputePGMRSrc2;
    // ScratchSize is in bytes, 16 aligned.
    PalMetadata[ScratchSizeKey] |= alignTo(CurrentProgramInfo.ScratchSize, 16);
  } else {
    PalMetadata[Rsrc1Reg] |= S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks)
        | S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks);
    if (CurrentProgramInfo.ScratchBlocks > 0)
      PalMetadata[Rsrc2Reg] |= S_00B84C_SCRATCH_EN(1);
    // ScratchSize is in bytes, 16 aligned.
    PalMetadata[ScratchSizeKey] |= alignTo(CurrentProgramInfo.ScratchSize, 16);
  }
  if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
    PalMetadata[Rsrc2Reg] |= S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks);
    PalMetadata[R_0286CC_SPI_PS_INPUT_ENA / 4] |= MFI->getPSInputEnable();
    PalMetadata[R_0286D0_SPI_PS_INPUT_ADDR / 4] |= MFI->getPSInputAddr();
  }
}

// This is supposed to be log2(Size).
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
  switch (Size) {
  case 4:
    return AMD_ELEMENT_4_BYTES;
  case 8:
    return AMD_ELEMENT_8_BYTES;
  case 16:
    return AMD_ELEMENT_16_BYTES;
  default:
    llvm_unreachable("invalid private_element_size");
  }
}

void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
                                        const SIProgramInfo &CurrentProgramInfo,
                                        const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();

  AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits());

  Out.compute_pgm_resource_registers =
      CurrentProgramInfo.ComputePGMRSrc1 |
      (CurrentProgramInfo.ComputePGMRSrc2 << 32);
  Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;

  if (CurrentProgramInfo.DynamicCallStack)
    Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;

  AMD_HSA_BITS_SET(Out.code_properties,
                   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
                   getElementByteSizeValue(STM.getMaxPrivateElementSize()));

  if (MFI->hasPrivateSegmentBuffer()) {
    Out.code_properties |=
      AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }

  if (MFI->hasDispatchPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

  if (MFI->hasQueuePtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;

  if (MFI->hasKernargSegmentPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;

  if (MFI->hasDispatchID())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;

  if (MFI->hasFlatScratchInit())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;

  if (MFI->hasGridWorkgroupCountX()) {
    Out.code_properties |=
      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
  }

  if (MFI->hasGridWorkgroupCountY()) {
    Out.code_properties |=
      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
  }

  if (MFI->hasGridWorkgroupCountZ()) {
    Out.code_properties |=
      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
  }

  if (STM.debuggerSupported())
    Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;

  if (STM.isXNACKEnabled())
    Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;

  // FIXME: Should use getKernArgSize
  Out.kernarg_segment_byte_size =
    STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
  Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
  Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
  Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
  Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
  Out.reserved_vgpr_first = CurrentProgramInfo.ReservedVGPRFirst;
  Out.reserved_vgpr_count = CurrentProgramInfo.ReservedVGPRCount;

  // These alignment values are specified in powers of two, so alignment =
  // 2^n. The minimum alignment is 2^4 = 16.
  Out.kernarg_segment_alignment = std::max((size_t)4,
      countTrailingZeros(MFI->getMaxKernArgAlign()));

  if (STM.debuggerEmitPrologue()) {
    Out.debug_wavefront_private_segment_offset_sgpr =
        CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
    Out.debug_private_segment_buffer_sgpr =
        CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR;
  }
}
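
// Print an inline-asm operand. Following the AsmPrinter convention, this
// returns false on success, and true when the operand or modifier is not
// supported.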
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                       unsigned AsmVariant,
                                       const char *ExtraCode, raw_ostream &O) {
  // First try the generic code, which knows about modifiers like 'c' and 'n'.
  if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O))
    return false;

  if (ExtraCode && ExtraCode[0]) {
    if (ExtraCode[1] != 0)
      return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    case 'r':
      break;
    default:
      return true;
    }
  }

  // TODO: Should be able to support other operand types like globals.
  const MachineOperand &MO = MI->getOperand(OpNo);
  if (MO.isReg()) {
    AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
                                       *MF->getSubtarget().getRegisterInfo());
    return false;
  }

  return true;
}