//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer -------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// The AMDGPUAsmPrinter is used to print both assembly strings and binary
/// code. When passed an MCAsmStreamer it prints assembly, and when passed
/// an MCObjectStreamer it outputs binary code.
//
//===----------------------------------------------------------------------===//
//

#include "AMDGPUAsmPrinter.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"

using namespace llvm;

// TODO: This should get the default rounding mode from the kernel. We just set
// the default here, but this could change if the OpenCL rounding mode pragmas
// are used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
//
// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
// precision, leaves single precision to flush all, and does not report
// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
// CL_FP_DENORM for both.
//
// FIXME: It seems some instructions do not support single precision denormals
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, sqrt_f32, and
// sin_f32, cos_f32 on most parts).
//
// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device, so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(const MachineFunction &F) {
  const SISubtarget &ST = F.getSubtarget<SISubtarget>();
  // TODO: Is there any real use for the flush in only / flush out only modes?

  uint32_t FP32Denormals =
    ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  uint32_t FP64Denormals =
    ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
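
  // The four two-bit controls below are packed into the low byte of the MODE
  // register's FLOAT_MODE field (see the FP_ROUND_MODE_* / FP_DENORM_MODE_*
  // macros in SIDefines.h): SP round in bits 1:0, DP round in bits 3:2,
  // SP denorm in bits 5:4, DP denorm in bits 7:6.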

  return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_DENORM_MODE_SP(FP32Denormals) |
         FP_DENORM_MODE_DP(FP64Denormals);
}

static AsmPrinter *
createAMDGPUAsmPrinterPass(TargetMachine &tm,
                           std::unique_ptr<MCStreamer> &&Streamer) {
  return new AMDGPUAsmPrinter(tm, std::move(Streamer));
}

extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
  TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
                                     createAMDGPUAsmPrinterPass);
  TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
                                     createAMDGPUAsmPrinterPass);
}

AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
  : AsmPrinter(TM, std::move(Streamer)) {
  AMDGPUASI = static_cast<AMDGPUTargetMachine*>(&TM)->getAMDGPUAS();
}

StringRef AMDGPUAsmPrinter::getPassName() const {
  return "AMDGPU Assembly Printer";
}

const MCSubtargetInfo *AMDGPUAsmPrinter::getSTI() const {
  return TM.getMCSubtargetInfo();
}

AMDGPUTargetStreamer &AMDGPUAsmPrinter::getTargetStreamer() const {
  return static_cast<AMDGPUTargetStreamer &>(*OutStreamer->getTargetStreamer());
}

void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  AMDGPU::IsaInfo::IsaVersion ISA =
      AMDGPU::IsaInfo::getIsaVersion(getSTI()->getFeatureBits());

  getTargetStreamer().EmitDirectiveHSACodeObjectVersion(2, 1);
  getTargetStreamer().EmitDirectiveHSACodeObjectISA(
      ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU");
  getTargetStreamer().EmitStartOfCodeObjectMetadata(M);
}

void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  getTargetStreamer().EmitEndOfCodeObjectMetadata();
}

bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
  const MachineBasicBlock *MBB) const {
  if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
    return false;

  if (MBB->empty())
    return true;

  // If this is a block implementing a long branch, an expression relative to
  // the start of the block is needed.
  // XXX - Is there a smarter way to check this?
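  //
  // Such blocks end in S_SETPC_B64, the terminator of an expanded long
  // branch; the address arithmetic references the block's start label, so the
  // label must be emitted even when the block is only reached by fallthrough.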
  return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
}

void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
  const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
  SIProgramInfo KernelInfo;
  amd_kernel_code_t KernelCode;
  if (STM.isAmdCodeObjectV2(*MF)) {
    getSIProgramInfo(KernelInfo, *MF);
    getAmdKernelCode(KernelCode, KernelInfo, *MF);

    OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
    getTargetStreamer().EmitAMDKernelCodeT(KernelCode);
  }

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;
  getTargetStreamer().EmitKernelCodeObjectMetadata(*MF->getFunction(),
                                                   KernelCode);
}

void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
  if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) {
    SmallString<128> SymbolName;
    getNameWithPrefix(SymbolName, MF->getFunction());
    getTargetStreamer().EmitAMDGPUSymbolType(
        SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
  }

  AsmPrinter::EmitFunctionEntryLabel();
}

void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
  // Group segment variables aren't emitted in HSA.
  if (AMDGPU::isGroupSegment(GV, AMDGPUASI))
    return;

  AsmPrinter::EmitGlobalVariable(GV);
}

bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  // The starting address of all shader programs must be 256-byte aligned.
  MF.setAlignment(8);

  SetupMachineFunction(MF);

  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
  MCContext &Context = getObjFileLowering().getContext();
  if (!STM.isAmdHsaOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(ConfigSection);
  }

  SIProgramInfo KernelInfo;
  if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    getSIProgramInfo(KernelInfo, MF);
    if (!STM.isAmdHsaOS()) {
      EmitProgramInfoSI(MF, KernelInfo);
    }
  } else {
    EmitProgramInfoR600(MF);
  }

  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  EmitFunctionBody();

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(CommentSection);

    if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      OutStreamer->emitRawComment(" Kernel info:", false);
      OutStreamer->emitRawComment(" codeLenInByte = " +
                                  Twine(getFunctionCodeSize(MF)), false);
      OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
                                  false);
      OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
                                  false);
      OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
                                  false);
      OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
                                  false);
      OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
                                  false);
      OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
                                  " bytes/workgroup (compile time only)", false);

      OutStreamer->emitRawComment(" SGPRBlocks: " +
                                  Twine(KernelInfo.SGPRBlocks), false);
      OutStreamer->emitRawComment(" VGPRBlocks: " +
                                  Twine(KernelInfo.VGPRBlocks), false);

      OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " +
                                  Twine(KernelInfo.NumSGPRsForWavesPerEU), false);
      OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " +
                                  Twine(KernelInfo.NumVGPRsForWavesPerEU), false);

      OutStreamer->emitRawComment(" ReservedVGPRFirst: " +
                                  Twine(KernelInfo.ReservedVGPRFirst), false);
      OutStreamer->emitRawComment(" ReservedVGPRCount: " +
                                  Twine(KernelInfo.ReservedVGPRCount), false);

      if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
        OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
                                    Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
        OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +
                                    Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);
      }

      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                  Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
                                  false);
      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
                                  Twine(G_00B84C_TRAP_HANDLER(KernelInfo.ComputePGMRSrc2)),
                                  false);
      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
                                  Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
                                  false);
      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
                                  Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
                                  false);
      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
                                  Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
                                  false);
      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
                                  Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
                                  false);
    } else {
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      OutStreamer->emitRawComment(
        Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize)));
    }
  }

  if (STM.dumpCode()) {
    OutStreamer->SwitchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));

    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
      Comment += " ; " + HexLines[i] + "\n";

      OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
      OutStreamer->EmitBytes(StringRef(Comment));
    }
  }

  return false;
}

void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
  unsigned MaxGPR = 0;
  bool killPixel = false;
  const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>();
  const R600RegisterInfo *RI = STM.getRegisterInfo();
  const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      if (MI.getOpcode() == AMDGPU::KILLGT)
        killPixel = true;
      unsigned numOperands = MI.getNumOperands();
      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
        const MachineOperand &MO = MI.getOperand(op_idx);
        if (!MO.isReg())
          continue;
        unsigned HWReg = RI->getHWRegIndex(MO.getReg());

        // Registers with a hardware index > 127 aren't GPRs.
        if (HWReg > 127)
          continue;
        MaxGPR = std::max(MaxGPR, HWReg);
      }
    }
  }

  unsigned RsrcReg;
  if (STM.getGeneration() >= R600Subtarget::EVERGREEN) {
    // Evergreen / Northern Islands
    switch (MF.getFunction()->getCallingConv()) {
    default: LLVM_FALLTHROUGH;
    case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
    case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
    case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
    case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
    }
  } else {
    // R600 / R700
    switch (MF.getFunction()->getCallingConv()) {
    default: LLVM_FALLTHROUGH;
    case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH;
    case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH;
    case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
    case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
    }
  }

  OutStreamer->EmitIntValue(RsrcReg, 4);
  OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
                            S_STACK_SIZE(MFI->CFStackSize), 4);
  OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
  OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);

  if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
    OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
    OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
  }
}

uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = STM.getInstrInfo();

  uint64_t CodeSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: CodeSize should account for multiple functions.

      // TODO: Should we count size of debug info?
      if (MI.isDebugValue())
        continue;

      CodeSize += TII->getInstSizeInBytes(MI);
    }
  }

  return CodeSize;
}

static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII,
                                  unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) const {
  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = STM.getInstrInfo();
  const SIRegisterInfo *RI = &TII->getRegisterInfo();

  MCPhysReg NumVGPRReg = AMDGPU::NoRegister;
  for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
    if (MRI.isPhysRegUsed(Reg)) {
      NumVGPRReg = Reg;
      break;
    }
  }

  MCPhysReg NumSGPRReg = AMDGPU::NoRegister;
  for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
    if (MRI.isPhysRegUsed(Reg)) {
      NumSGPRReg = Reg;
      break;
    }
  }

  // We found the maximum register index. They start at 0, so add one to get
  // the number of registers.
  ProgInfo.NumVGPR = NumVGPRReg == AMDGPU::NoRegister ? 0 :
    RI->getHWRegIndex(NumVGPRReg) + 1;
  ProgInfo.NumSGPR = NumSGPRReg == AMDGPU::NoRegister ? 0 :
    RI->getHWRegIndex(NumSGPRReg) + 1;
  unsigned ExtraSGPRs = 0;

  ProgInfo.VCCUsed = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
                     MRI.isPhysRegUsed(AMDGPU::VCC_HI);
  if (ProgInfo.VCCUsed)
    ExtraSGPRs = 2;

  ProgInfo.FlatUsed = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                      MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly
  // may need it though.
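  //
  // E.g., a kernel whose only flat_scr appearances are the implicit operands
  // on flat loads and stores never actually initializes or reads the
  // register directly.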
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (ProgInfo.FlatUsed && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    ProgInfo.FlatUsed = false;
  }

  if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
    if (ProgInfo.FlatUsed)
      ExtraSGPRs = 4;
  } else {
    if (STM.isXNACKEnabled())
      ExtraSGPRs = 4;

    if (ProgInfo.FlatUsed)
      ExtraSGPRs = 6;
  }

  unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF);

  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction()->getContext();
      DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
                                       "addressable scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
    }
  }

  // Account for the extra SGPRs used by VCC and FLAT_SCRATCH/XNACK, and the
  // extra VGPRs reserved for debugger use.
  ProgInfo.NumSGPR += ExtraSGPRs;
  ProgInfo.NumVGPR += ExtraVGPRs;

  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  ProgInfo.NumSGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
  ProgInfo.NumVGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));

  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction()->getContext();
      DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
                                       "scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
      ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
    }
  }

  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
    ProgInfo.NumSGPRsForWavesPerEU =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
  }

  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction()->getContext();
    DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction()->getContext();
    DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory",
                                     MFI->getLDSSize(), DS_Error);
    Ctx.diagnose(Diag);
  }

  // SGPRBlocks is the actual number of SGPR blocks minus 1.
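  // For example, assuming an SGPR encoding granule of 8: with
  // NumSGPRsForWavesPerEU = 42, alignTo(42, 8) = 48, and 48 / 8 - 1 = 5 is
  // the value programmed into the rsrc register.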
  ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU,
                                STM.getSGPREncodingGranule());
  ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / STM.getSGPREncodingGranule() - 1;

  // VGPRBlocks is the actual number of VGPR blocks minus 1.
  ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU,
                                STM.getVGPREncodingGranule());
  ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / STM.getVGPREncodingGranule() - 1;

  // Record first reserved VGPR and number of reserved VGPRs.
  ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? ProgInfo.NumVGPR : 0;
  ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF);

  // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
  // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
  // attribute was requested.
  if (STM.debuggerEmitPrologue()) {
    ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
        RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
    ProgInfo.DebuggerPrivateSegmentBufferSGPR =
        RI->getHWRegIndex(MFI->getScratchRSrcReg());
  }

  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(MF);

  ProgInfo.IEEEMode = STM.enableIEEEBit(MF);

  // Make the clamp modifier return 0 on NaN input.
  ProgInfo.DX10Clamp = STM.enableDX10Clamp();

  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  ProgInfo.ScratchSize = FrameInfo.getStackSize();

  unsigned LDSAlignShift;
  if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  } else {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  }

  unsigned LDSSpillSize =
    MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();

  ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // Scratch is allocated in 256 dword blocks.
  unsigned ScratchAlignShift = 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
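  //
  // For example, with ScratchSize = 68 bytes per thread and a 64-wide
  // wavefront, 68 * 64 = 4352 bytes per wave rounds up to 5 blocks of
  // 1024 bytes (256 dwords).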
  ProgInfo.ScratchBlocks =
      alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
              1ULL << ScratchAlignShift) >>
      ScratchAlignShift;

  ProgInfo.ComputePGMRSrc1 =
      S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
      S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
      S_00B848_PRIORITY(ProgInfo.Priority) |
      S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
      S_00B848_PRIV(ProgInfo.Priv) |
      S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
      S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
      S_00B848_IEEE_MODE(ProgInfo.IEEEMode);

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  ProgInfo.ComputePGMRSrc2 =
      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
      S_00B84C_TRAP_HANDLER(STM.isTrapHandlerEnabled()) |
      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
      S_00B84C_EXCP_EN_MSB(0) |
      S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
      S_00B84C_EXCP_EN(0);
}

static unsigned getRsrcReg(CallingConv::ID CallConv) {
  switch (CallConv) {
  default: LLVM_FALLTHROUGH;
  case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
  case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
  case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
  case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
  }
}

void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &KernelInfo) {
  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());

  if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
    OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);

    OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);

    OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
    OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);

    OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
    OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    OutStreamer->EmitIntValue(RsrcReg, 4);
    OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
                              S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
    if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
      OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
      OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
    }
  }

  if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
    OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
    OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
    OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
    OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
    OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
  }

  OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
  OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
  OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
  OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
}

// This is supposed to be log2(Size).
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
  switch (Size) {
  case 4:
    return AMD_ELEMENT_4_BYTES;
  case 8:
    return AMD_ELEMENT_8_BYTES;
  case 16:
    return AMD_ELEMENT_16_BYTES;
  default:
    llvm_unreachable("invalid private_element_size");
  }
}

void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
                                        const SIProgramInfo &KernelInfo,
                                        const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();

  AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits());

  Out.compute_pgm_resource_registers =
      KernelInfo.ComputePGMRSrc1 |
      (uint64_t(KernelInfo.ComputePGMRSrc2) << 32);
  Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;

  AMD_HSA_BITS_SET(Out.code_properties,
                   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
                   getElementByteSizeValue(STM.getMaxPrivateElementSize()));

  if (MFI->hasPrivateSegmentBuffer()) {
    Out.code_properties |=
        AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }

  if (MFI->hasDispatchPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

  if (MFI->hasQueuePtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;

  if (MFI->hasKernargSegmentPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;

  if (MFI->hasDispatchID())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;

  if (MFI->hasFlatScratchInit())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;

  if (MFI->hasGridWorkgroupCountX()) {
    Out.code_properties |=
        AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
  }

  if (MFI->hasGridWorkgroupCountY()) {
    Out.code_properties |=
        AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
  }

  if (MFI->hasGridWorkgroupCountZ()) {
    Out.code_properties |=
        AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
  }

  if (STM.debuggerSupported())
    Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;

  if (STM.isXNACKEnabled())
    Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;

  // FIXME: Should use getKernArgSize
  Out.kernarg_segment_byte_size =
      STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
  Out.wavefront_sgpr_count = KernelInfo.NumSGPR;
  Out.workitem_vgpr_count = KernelInfo.NumVGPR;
  Out.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
  Out.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
  Out.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
  Out.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;

  // These alignment values are specified in powers of two, so alignment =
  // 2^n. The minimum alignment is 2^4 = 16.
  Out.kernarg_segment_alignment = std::max((size_t)4,
      countTrailingZeros(MFI->getMaxKernArgAlign()));

  if (STM.debuggerEmitPrologue()) {
    Out.debug_wavefront_private_segment_offset_sgpr =
        KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
    Out.debug_private_segment_buffer_sgpr =
        KernelInfo.DebuggerPrivateSegmentBufferSGPR;
  }
}

bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                       unsigned AsmVariant,
                                       const char *ExtraCode, raw_ostream &O) {
  if (ExtraCode && ExtraCode[0]) {
    if (ExtraCode[1] != 0)
      return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    default:
      // See if this is a generic print operand.
      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
    case 'r':
      break;
    }
  }

  AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O,
      *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo());
  return false;
}