//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// The AMDGPUAsmPrinter is used to print both assembly strings and binary
/// code. When passed an MCAsmStreamer it prints assembly and when passed
/// an MCObjectStreamer it outputs binary code.
//
//===----------------------------------------------------------------------===//
//

#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600AsmPrinter.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"

using namespace llvm;
using namespace llvm::AMDGPU;
using namespace llvm::AMDGPU::HSAMD;

// We need to tell the runtime some amount ahead of time if we don't know the
// true stack size. Assume a smaller number if this is only due to dynamic /
// non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"),
    cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden,
    cl::init(4096));

// This should get the default rounding mode from the kernel. We just set the
// default here, but this could change if the OpenCL rounding mode pragmas are
// used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
//
// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
// precision, and leaves single precision to flush all and does not report
// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
// CL_FP_DENORM for both.
//
// FIXME: It seems some instructions do not support single precision denormals
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, sqrt_f32, and
// sin_f32, cos_f32 on most parts).
//
// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
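//
// A worked example of the encoding below, assuming the layout of the
// FP_ROUND_MODE_* / FP_DENORM_MODE_* macros in SIDefines.h: the rounding mode
// occupies bits [3:0] (SP in [1:0], DP in [3:2]) and the denormal mode bits
// [7:4] (SP in [5:4], DP in [7:6]). Round-to-nearest everywhere (value 0) with
// IEEE denormals enabled for both precisions (FP_DENORM_FLUSH_NONE == 3) then
// encodes as (3 << 4) | (3 << 6) == 0xF0.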
static uint32_t getFPMode(AMDGPU::SIModeRegisterDefaults Mode) {
  return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
         FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
}

static AsmPrinter *
createAMDGPUAsmPrinterPass(TargetMachine &tm,
                           std::unique_ptr<MCStreamer> &&Streamer) {
  return new AMDGPUAsmPrinter(tm, std::move(Streamer));
}

extern "C" void LLVM_EXTERNAL_VISIBILITY LLVMInitializeAMDGPUAsmPrinter() {
  TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
                                     llvm::createR600AsmPrinterPass);
  TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
                                     createAMDGPUAsmPrinterPass);
}

AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
    : AsmPrinter(TM, std::move(Streamer)) {
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    if (isHsaAbiVersion2(getGlobalSTI())) {
      HSAMetadataStream.reset(new MetadataStreamerV2());
    } else {
      HSAMetadataStream.reset(new MetadataStreamerV3());
    }
  }
}

StringRef AMDGPUAsmPrinter::getPassName() const {
  return "AMDGPU Assembly Printer";
}

const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
  return TM.getMCSubtargetInfo();
}

AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
  if (!OutStreamer)
    return nullptr;
  return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
}

void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
  if (isHsaAbiVersion3(getGlobalSTI())) {
    std::string ExpectedTarget;
    raw_string_ostream ExpectedTargetOS(ExpectedTarget);
    IsaInfo::streamIsaVersion(getGlobalSTI(), ExpectedTargetOS);

    getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget);
  }

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
      TM.getTargetTriple().getOS() != Triple::AMDPAL)
    return;

  if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
    HSAMetadataStream->begin(M);

  if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
    getTargetStreamer()->getPALMetadata()->readFromIR(M);

  if (isHsaAbiVersion3(getGlobalSTI()))
    return;

  // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
    getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);

  // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2.
  IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU());
  getTargetStreamer()->EmitDirectiveHSACodeObjectISA(
      Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
}
void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
  // Following code requires TargetStreamer to be present.
  if (!getTargetStreamer())
    return;

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
      isHsaAbiVersion2(getGlobalSTI())) {
    // Emit ISA Version (NT_AMD_AMDGPU_ISA).
    std::string ISAVersionString;
    raw_string_ostream ISAVersionStream(ISAVersionString);
    IsaInfo::streamIsaVersion(getGlobalSTI(), ISAVersionStream);
    getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
  }

  // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    HSAMetadataStream->end();
    bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
    (void)Success;
    assert(Success && "Malformed HSA Metadata");
  }
}

bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
    const MachineBasicBlock *MBB) const {
  if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
    return false;

  if (MBB->empty())
    return true;

  // If this is a block implementing a long branch, an expression relative to
  // the start of the block is needed.
  // XXX - Is there a smarter way to check this?
  return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
}

void AMDGPUAsmPrinter::emitFunctionBodyStart() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  const Function &F = MF->getFunction();
  if ((STM.isMesaKernel(F) || isHsaAbiVersion2(getGlobalSTI())) &&
      (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
       F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
    amd_kernel_code_t KernelCode;
    getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
    getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
  }

  if (STM.isAmdHsaOS())
    HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
}

void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
      isHsaAbiVersion2(getGlobalSTI()))
    return;

  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  auto &ObjectFileInfo = *Context.getObjectFileInfo();
  auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();

  Streamer.PushSection();
  Streamer.SwitchSection(&ReadOnlySection);

  // CP microcode requires the kernel descriptor to be allocated on 64 byte
  // alignment.
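  // Two adjustments are needed: emitValueToAlignment below pads the current
  // offset within the section, and setAlignment raises the section's own
  // alignment (sh_addralign) so the padded offset remains 64-byte aligned
  // wherever the section is finally placed.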
  Streamer.emitValueToAlignment(64, 0, 1, 0);
  if (ReadOnlySection.getAlignment() < 64)
    ReadOnlySection.setAlignment(Align(64));

  const MCSubtargetInfo &STI = MF->getSubtarget();

  SmallString<128> KernelName;
  getNameWithPrefix(KernelName, &MF->getFunction());
  getTargetStreamer()->EmitAmdhsaKernelDescriptor(
      STI, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
      CurrentProgramInfo.NumVGPRsForWavesPerEU,
      CurrentProgramInfo.NumSGPRsForWavesPerEU -
          IsaInfo::getNumExtraSGPRs(&STI,
                                    CurrentProgramInfo.VCCUsed,
                                    CurrentProgramInfo.FlatUsed),
      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
      hasXNACK(STI));

  Streamer.PopSection();
}

void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA &&
      isHsaAbiVersion3(getGlobalSTI())) {
    AsmPrinter::emitFunctionEntryLabel();
    return;
  }

  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
    SmallString<128> SymbolName;
    getNameWithPrefix(SymbolName, &MF->getFunction());
    getTargetStreamer()->EmitAMDGPUSymbolType(
        SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
  }
  if (DumpCodeInstEmitter) {
    // Disassemble function name label to text.
    DisasmLines.push_back(MF->getName().str() + ":");
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.push_back("");
  }

  AsmPrinter::emitFunctionEntryLabel();
}

void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
  if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
    // Write a line for the basic block label if it is not only fallthrough.
    DisasmLines.push_back(
        (Twine("BB") + Twine(getFunctionNumber())
         + "_" + Twine(MBB.getNumber()) + ":").str());
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.push_back("");
  }
  AsmPrinter::emitBasicBlockStart(MBB);
}
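// Both hooks above push a DisasmLines entry together with an empty HexLines
// entry: the two vectors stay in lockstep, one entry per output line.
// runOnMachineFunction later prints them side by side into .AMDGPU.disasm as
// "<text><padding> ; <hex>", padding to DisasmLineMaxLen; the hex column stays
// empty for label lines such as these.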
void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
  if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
      OutContext.reportError({},
                             Twine(GV->getName()) +
                                 ": unsupported initializer for address space");
      return;
    }

    // LDS variables aren't emitted in HSA or PAL yet.
    const Triple::OSType OS = TM.getTargetTriple().getOS();
    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
      return;

    MCSymbol *GVSym = getSymbol(GV);

    GVSym->redefineIfPossible();
    if (GVSym->isDefined() || GVSym->isVariable())
      report_fatal_error("symbol '" + Twine(GVSym->getName()) +
                         "' is already defined");

    const DataLayout &DL = GV->getParent()->getDataLayout();
    uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
    Align Alignment = GV->getAlign().getValueOr(Align(4));

    emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
    emitLinkage(GV, GVSym);
    if (auto TS = getTargetStreamer())
      TS->emitAMDGPULDS(GVSym, Size, Alignment);
    return;
  }

  AsmPrinter::emitGlobalVariable(GV);
}

bool AMDGPUAsmPrinter::doFinalization(Module &M) {
  CallGraphResourceInfo.clear();

  // Pad with s_code_end to help tools and guard against instruction prefetch
  // causing stale data in caches. Arguably this should be done by the linker,
  // which is why this isn't done for Mesa.
  const MCSubtargetInfo &STI = *getGlobalSTI();
  if (AMDGPU::isGFX10(STI) &&
      (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
       STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
    OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
    getTargetStreamer()->EmitCodeEnd();
  }

  return AsmPrinter::doFinalization(M);
}

// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
    uint32_t NumVGPR,
    Optional<uint32_t> NumAGPR,
    uint32_t TotalNumVGPR,
    uint32_t NumSGPR,
    uint64_t ScratchSize,
    uint64_t CodeSize,
    const AMDGPUMachineFunction *MFI) {
  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
  OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
  OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
  if (NumAGPR) {
    OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
    OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
                                false);
  }
  OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
  OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
                              false);
}

uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
    const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  uint16_t KernelCodeProperties = 0;

  if (MFI.hasPrivateSegmentBuffer()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }
  if (MFI.hasDispatchPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
  }
  if (MFI.hasQueuePtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
  }
  if (MFI.hasKernargSegmentPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
  }
  if (MFI.hasDispatchID()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
  }
  if (MFI.hasFlatScratchInit()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
  }
  if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
  }

  return KernelCodeProperties;
}
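// For example, a wave32 kernel that only needs the kernarg segment pointer
// preloaded would report
//   KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
//   KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32
// above; each ENABLE_SGPR_* bit requests that the runtime initialize the
// corresponding user SGPR input before dispatch.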
amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
    const MachineFunction &MF,
    const SIProgramInfo &PI) const {
  amdhsa::kernel_descriptor_t KernelDescriptor;
  memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));

  assert(isUInt<32>(PI.ScratchSize));
  assert(isUInt<32>(PI.getComputePGMRSrc1()));
  assert(isUInt<32>(PI.ComputePGMRSrc2));

  KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
  KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
  KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
  KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
  KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);

  return KernelDescriptor;
}
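// runOnMachineFunction below relies on CodeGen visiting functions in call
// graph SCC order (callees before callers; see the comment in
// analyzeResourceUsage), so by the time a caller is processed, the resource
// usage of every non-external callee is already in CallGraphResourceInfo.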
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  CurrentProgramInfo = SIProgramInfo();

  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  // The starting address of all shader programs must be 256 bytes aligned.
  // Regular functions just need the basic required instruction alignment.
  MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));

  SetupMachineFunction(MF);

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Context = getObjFileLowering().getContext();
  // FIXME: This should be an explicit check for Mesa.
  if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(ConfigSection);
  }

  if (MFI->isEntryFunction()) {
    getSIProgramInfo(CurrentProgramInfo, MF);
  } else {
    auto I = CallGraphResourceInfo.insert(
        std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = I.first->second;
    assert(I.second && "should only be called once per function");
    Info = analyzeResourceUsage(MF);
  }

  if (STM.isAmdPalOS() && MFI->isEntryFunction())
    EmitPALMetadata(MF, CurrentProgramInfo);
  else if (!STM.isAmdHsaOS()) {
    EmitProgramInfoSI(MF, CurrentProgramInfo);
  }

  DumpCodeInstEmitter = nullptr;
  if (STM.dumpCode()) {
    // For -dumpcode, get the assembler out of the streamer, even if it does
    // not really want to let us have it. This only works with -filetype=obj.
    bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
    OutStreamer->setUseAssemblerInfoForParsing(true);
    MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
    OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
    if (Assembler)
      DumpCodeInstEmitter = Assembler->getEmitterPtr();
  }

  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  emitFunctionBody();

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(CommentSection);

    if (!MFI->isEntryFunction()) {
      OutStreamer->emitRawComment(" Function info:", false);
      SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
      emitCommonFunctionComments(
          Info.NumVGPR,
          STM.hasMAIInsts() ? Info.NumAGPR : Optional<uint32_t>(),
          Info.getTotalNumVGPRs(STM),
          Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
          Info.PrivateSegmentSize,
          getFunctionCodeSize(MF), MFI);
      return false;
    }

    OutStreamer->emitRawComment(" Kernel info:", false);
    emitCommonFunctionComments(CurrentProgramInfo.NumArchVGPR,
                               STM.hasMAIInsts()
                                   ? CurrentProgramInfo.NumAccVGPR
                                   : Optional<uint32_t>(),
                               CurrentProgramInfo.NumVGPR,
                               CurrentProgramInfo.NumSGPR,
                               CurrentProgramInfo.ScratchSize,
                               getFunctionCodeSize(MF), MFI);

    OutStreamer->emitRawComment(
        " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
    OutStreamer->emitRawComment(
        " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
    OutStreamer->emitRawComment(
        " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
            " bytes/workgroup (compile time only)", false);

    OutStreamer->emitRawComment(
        " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
    OutStreamer->emitRawComment(
        " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);

    OutStreamer->emitRawComment(
        " NumSGPRsForWavesPerEU: " +
        Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
    OutStreamer->emitRawComment(
        " NumVGPRsForWavesPerEU: " +
        Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);

    OutStreamer->emitRawComment(
        " Occupancy: " +
        Twine(CurrentProgramInfo.Occupancy), false);

    OutStreamer->emitRawComment(
        " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);

    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:USER_SGPR: " +
        Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
        Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
        Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
        Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
        Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
        Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
        false);
  }

  if (DumpCodeInstEmitter) {
    OutStreamer->SwitchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));

    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment = "\n";
      if (!HexLines[i].empty()) {
        Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
        Comment += " ; " + HexLines[i] + "\n";
      }

      OutStreamer->emitBytes(StringRef(DisasmLines[i]));
      OutStreamer->emitBytes(StringRef(Comment));
    }
  }

  return false;
}
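// getFunctionCodeSize is a conservative estimate: SIInstrInfo's
// getInstSizeInBytes returns fixed encoding sizes for real instructions and
// only an estimate for inline asm, and debug instructions are skipped because
// they emit no machine code.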
uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = STM.getInstrInfo();

  uint64_t CodeSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: CodeSize should account for multiple functions.

      // TODO: Should we count size of debug info?
      if (MI.isDebugInstr())
        continue;

      CodeSize += TII->getInstSizeInBytes(MI);
    }
  }

  return CodeSize;
}

static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII,
                                  unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(&ST,
                                                     UsesVCC, UsesFlatScratch);
}

int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return std::max(NumVGPR, NumAGPR);
}

static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }

  return cast<Function>(Op.getGlobal());
}

AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
    const MachineFunction &MF) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
                 MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister ? 0 :
                     TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }
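    // For example, if the highest VGPR found by the scans above is v17
    // (hardware index 17), the function occupies v0-v17, so NumVGPR below
    // becomes 18.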
    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
                   TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
                           TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }
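        // The chain below maps the operand's register class to its width in
        // 32-bit registers. For example, an SReg_128 operand starting at s4
        // covers s[4:7]: HWReg = 4 and Width = 4, so MaxSGPR is raised to at
        // least 7.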
        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }
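      // Calls are the other source of register and stack pressure: either
      // fold in the callee's already-computed usage, or fall back to
      // conservative guesses for external callees (tunable through
      // -amdgpu-assume-external-call-stack-size declared above).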
      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp
          = TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        if (!Callee || Callee->isDeclaration()) {
          // If this is a call to an external function, we can't do much. Make
          // conservative guesses.

          // 48 SGPRs - vcc, - flat_scr, -xnack
          int MaxSGPRGuess =
              47 - IsaInfo::getNumExtraSGPRs(&ST, true,
                                             ST.hasFlatAddressSpace());
          MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
          MaxVGPR = std::max(MaxVGPR, 23);
          MaxAGPR = std::max(MaxAGPR, 23);

          CalleeFrameSize = std::max(CalleeFrameSize,
              static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.

          auto I = CallGraphResourceInfo.find(Callee);
          if (I == CallGraphResourceInfo.end()) {
            // Avoid crashing on undefined behavior with an illegal call to a
            // kernel. If a callsite's calling convention doesn't match the
            // function's, it's undefined behavior. If the callsite calling
            // convention does match, that would have errored earlier.
            // FIXME: The verifier shouldn't allow this.
            if (AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
              report_fatal_error("invalid call to entry function");

            llvm_unreachable("callee should have been handled before caller");
          }

          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize
            = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
        }

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse())
          Info.HasRecursion = true;
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();

  ProgInfo.NumArchVGPR = Info.NumVGPR;
  ProgInfo.NumAccVGPR = Info.NumAGPR;
  ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
  ProgInfo.NumSGPR = Info.NumExplicitSGPR;
  ProgInfo.ScratchSize = Info.PrivateSegmentSize;
  ProgInfo.VCCUsed = Info.UsesVCC;
  ProgInfo.FlatUsed = Info.UsesFlatScratch;
  ProgInfo.DynamicCallStack =
      Info.HasDynamicallySizedStack || Info.HasRecursion;

  const uint64_t MaxScratchPerWorkitem =
      GCNSubtarget::MaxWaveScratchSize / STM.getWavefrontSize();
  if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
    DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
                                          ProgInfo.ScratchSize, DS_Error);
    MF.getFunction().getContext().diagnose(DiagStackSize);
  }

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
  // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
  // unified.
  unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
      &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed);

  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(),
                                       "addressable scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
    }
  }

  // Account for extra SGPRs and VGPRs reserved for debugger use.
  ProgInfo.NumSGPR += ExtraSGPRs;

  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers are function args.
  unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
  for (auto &Arg : MF.getFunction().args()) {
    unsigned NumRegs = (Arg.getType()->getPrimitiveSizeInBits() + 31) / 32;
    if (Arg.hasAttribute(Attribute::InReg))
      WaveDispatchNumSGPR += NumRegs;
    else
      WaveDispatchNumVGPR += NumRegs;
  }
  ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
  ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);

  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  ProgInfo.NumSGPRsForWavesPerEU = std::max(
      std::max(ProgInfo.NumSGPR, 1u),
      STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
  ProgInfo.NumVGPRsForWavesPerEU = std::max(
      std::max(ProgInfo.NumVGPR, 1u),
      STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));

  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(),
                                       "scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
      ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
    }
  }

  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
    ProgInfo.NumSGPRsForWavesPerEU =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
  }

  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory",
                                     MFI->getLDSSize(), DS_Error);
    Ctx.diagnose(Diag);
  }

  ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
      &STM, ProgInfo.NumSGPRsForWavesPerEU);
  ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
      &STM, ProgInfo.NumVGPRsForWavesPerEU);

  const SIModeRegisterDefaults Mode = MFI->getMode();

  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(Mode);

  ProgInfo.IEEEMode = Mode.IEEE;

  // Make the clamp modifier on a NaN input return 0.
  ProgInfo.DX10Clamp = Mode.DX10Clamp;

  unsigned LDSAlignShift;
  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  } else {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  }

  unsigned LDSSpillSize =
      MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();

  ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
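  // Worked example: with LDSAlignShift == 9 the granularity is 512 bytes
  // (128 dwords), so an LDSSize of 1000 bytes is aligned up to 1024 and
  // reported as LDSBlocks == 2.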
  // Scratch is allocated in 256 dword blocks.
  unsigned ScratchAlignShift = 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks =
      alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
              1ULL << ScratchAlignShift) >>
      ScratchAlignShift;
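  // Worked example: 68 bytes of scratch per thread in a wave64 is
  // 68 * 64 = 4352 bytes per wave, aligned up to 5 * 1024 bytes, so
  // ScratchBlocks == 5.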
  if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
    ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
    ProgInfo.MemOrdered = 1;
  }

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  ProgInfo.ComputePGMRSrc2 =
      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
      // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
      S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
      S_00B84C_EXCP_EN_MSB(0) |
      // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
      S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
      S_00B84C_EXCP_EN(0);

  ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
                                            ProgInfo.NumSGPRsForWavesPerEU,
                                            ProgInfo.NumVGPRsForWavesPerEU);
}

static unsigned getRsrcReg(CallingConv::ID CallConv) {
  switch (CallConv) {
  default: LLVM_FALLTHROUGH;
  case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
  case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
  case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
  case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
  case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
  case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
  case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
  }
}
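// EmitProgramInfoSI writes the .AMDGPU.config section used when targeting
// neither AMDHSA nor AMDPAL (in practice, Mesa). The format is a flat list of
// 32-bit pairs: a register offset such as R_00B848_COMPUTE_PGM_RSRC1 followed
// by the value to program into that register.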
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());

  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);

    OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1());

    OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
    OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);

    OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
    OutStreamer->emitInt32(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks));

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    OutStreamer->emitInt32(RsrcReg);
    OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
                              S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
    OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
    OutStreamer->emitIntValue(
        S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
  }

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
    OutStreamer->emitInt32(
        S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
    OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
    OutStreamer->emitInt32(MFI->getPSInputEnable());
    OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
    OutStreamer->emitInt32(MFI->getPSInputAddr());
  }

  OutStreamer->emitInt32(R_SPILLED_SGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
  OutStreamer->emitInt32(R_SPILLED_VGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
}

// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PALMD::Metadata, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, the PAL metadata
// is then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
                                       const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  auto CC = MF.getFunction().getCallingConv();
  auto MD = getTargetStreamer()->getPALMetadata();

  MD->setEntryPoint(CC, MF.getFunction().getName());
  MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
  MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
  MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
  if (AMDGPU::isCompute(CC)) {
    MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2);
  } else {
    if (CurrentProgramInfo.ScratchBlocks > 0)
      MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
  }
  // ScratchSize is in bytes, 16 aligned.
  MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
    MD->setSpiPsInputEna(MFI->getPSInputEnable());
    MD->setSpiPsInputAddr(MFI->getPSInputAddr());
  }

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  if (STM.isWave32())
    MD->setWave32(MF.getFunction().getCallingConv());
}

// This is supposed to be log2(Size).
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
  switch (Size) {
  case 4:
    return AMD_ELEMENT_4_BYTES;
  case 8:
    return AMD_ELEMENT_8_BYTES;
  case 16:
    return AMD_ELEMENT_16_BYTES;
  default:
    llvm_unreachable("invalid private_element_size");
  }
}

void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
                                        const SIProgramInfo &CurrentProgramInfo,
                                        const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();

  AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);

  Out.compute_pgm_resource_registers =
      CurrentProgramInfo.getComputePGMRSrc1() |
      (CurrentProgramInfo.ComputePGMRSrc2 << 32);
  Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;

  if (CurrentProgramInfo.DynamicCallStack)
    Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;

  AMD_HSA_BITS_SET(Out.code_properties,
                   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
                   getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));

  if (MFI->hasPrivateSegmentBuffer()) {
    Out.code_properties |=
        AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }

  if (MFI->hasDispatchPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

  if (MFI->hasQueuePtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;

  if (MFI->hasKernargSegmentPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;

  if (MFI->hasDispatchID())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;

  if (MFI->hasFlatScratchInit())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;

  if (STM.isXNACKEnabled())
    Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;

  Align MaxKernArgAlign;
  Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
  Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
  Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
  Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
  Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;

  // kernarg_segment_alignment is specified as log of the alignment.
  // The minimum alignment is 16.
  Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
}
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                       const char *ExtraCode, raw_ostream &O) {
  // First try the generic code, which knows about modifiers like 'c' and 'n'.
  if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
    return false;

  if (ExtraCode && ExtraCode[0]) {
    if (ExtraCode[1] != 0)
      return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    case 'r':
      break;
    default:
      return true;
    }
  }

  // TODO: Should be able to support other operand types like globals.
  const MachineOperand &MO = MI->getOperand(OpNo);
  if (MO.isReg()) {
    AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
                                       *MF->getSubtarget().getRegisterInfo());
    return false;
  } else if (MO.isImm()) {
    int64_t Val = MO.getImm();
    if (AMDGPU::isInlinableIntLiteral(Val)) {
      O << Val;
    } else if (isUInt<16>(Val)) {
      O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
    } else if (isUInt<32>(Val)) {
      O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
    } else {
      O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
    }
    return false;
  }
  return true;
}