//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer --------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
/// code. When passed an MCAsmStreamer it prints assembly and when passed
/// an MCObjectStreamer it outputs binary code.
//
//===----------------------------------------------------------------------===//
//

#include "AMDGPUAsmPrinter.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "AMDGPU.h"
#include "AMDKernelCodeT.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"

using namespace llvm;

// TODO: This should get the default rounding mode from the kernel. We just set
// the default here, but this could change if the OpenCL rounding mode pragmas
// are used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
52 // 53 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double 54 // precision, and leaves single precision to flush all and does not report 55 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports 56 // CL_FP_DENORM for both. 57 // 58 // FIXME: It seems some instructions do not support single precision denormals 59 // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32, 60 // and sin_f32, cos_f32 on most parts). 61 62 // We want to use these instructions, and using fp32 denormals also causes 63 // instructions to run at the double precision rate for the device so it's 64 // probably best to just report no single precision denormals. 65 static uint32_t getFPMode(const MachineFunction &F) { 66 const SISubtarget& ST = F.getSubtarget<SISubtarget>(); 67 // TODO: Is there any real use for the flush in only / flush out only modes? 68 69 uint32_t FP32Denormals = 70 ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; 71 72 uint32_t FP64Denormals = 73 ST.hasFP64Denormals() ? 
FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; 74 75 return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | 76 FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | 77 FP_DENORM_MODE_SP(FP32Denormals) | 78 FP_DENORM_MODE_DP(FP64Denormals); 79 } 80 81 static AsmPrinter * 82 createAMDGPUAsmPrinterPass(TargetMachine &tm, 83 std::unique_ptr<MCStreamer> &&Streamer) { 84 return new AMDGPUAsmPrinter(tm, std::move(Streamer)); 85 } 86 87 extern "C" void LLVMInitializeAMDGPUAsmPrinter() { 88 TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(), 89 createAMDGPUAsmPrinterPass); 90 TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(), 91 createAMDGPUAsmPrinterPass); 92 } 93 94 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, 95 std::unique_ptr<MCStreamer> Streamer) 96 : AsmPrinter(TM, std::move(Streamer)) {} 97 98 StringRef AMDGPUAsmPrinter::getPassName() const { 99 return "AMDGPU Assembly Printer"; 100 } 101 102 void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { 103 if (TM.getTargetTriple().getOS() != Triple::AMDHSA) 104 return; 105 106 // Need to construct an MCSubtargetInfo here in case we have no functions 107 // in the module. 108 std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo( 109 TM.getTargetTriple().str(), TM.getTargetCPU(), 110 TM.getTargetFeatureString())); 111 112 AMDGPUTargetStreamer *TS = 113 static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); 114 115 TS->EmitDirectiveHSACodeObjectVersion(2, 1); 116 117 AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits()); 118 TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, 119 "AMD", "AMDGPU"); 120 121 // Emit runtime metadata. 
122 TS->EmitRuntimeMetadata(M); 123 } 124 125 bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( 126 const MachineBasicBlock *MBB) const { 127 if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB)) 128 return false; 129 130 if (MBB->empty()) 131 return true; 132 133 // If this is a block implementing a long branch, an expression relative to 134 // the start of the block is needed. to the start of the block. 135 // XXX - Is there a smarter way to check this? 136 return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64); 137 } 138 139 140 void AMDGPUAsmPrinter::EmitFunctionBodyStart() { 141 const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); 142 SIProgramInfo KernelInfo; 143 if (STM.isAmdCodeObjectV2()) { 144 getSIProgramInfo(KernelInfo, *MF); 145 EmitAmdKernelCodeT(*MF, KernelInfo); 146 } 147 } 148 149 void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { 150 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 151 const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); 152 if (MFI->isKernel() && STM.isAmdCodeObjectV2()) { 153 AMDGPUTargetStreamer *TS = 154 static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); 155 SmallString<128> SymbolName; 156 getNameWithPrefix(SymbolName, MF->getFunction()), 157 TS->EmitAMDGPUSymbolType(SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); 158 } 159 160 AsmPrinter::EmitFunctionEntryLabel(); 161 } 162 163 void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { 164 165 // Group segment variables aren't emitted in HSA. 166 if (AMDGPU::isGroupSegment(GV)) 167 return; 168 169 AsmPrinter::EmitGlobalVariable(GV); 170 } 171 172 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { 173 174 // The starting address of all shader programs must be 256 bytes aligned. 
175 MF.setAlignment(8); 176 177 SetupMachineFunction(MF); 178 179 const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); 180 MCContext &Context = getObjFileLowering().getContext(); 181 if (!STM.isAmdHsaOS()) { 182 MCSectionELF *ConfigSection = 183 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); 184 OutStreamer->SwitchSection(ConfigSection); 185 } 186 187 SIProgramInfo KernelInfo; 188 if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { 189 getSIProgramInfo(KernelInfo, MF); 190 if (!STM.isAmdHsaOS()) { 191 EmitProgramInfoSI(MF, KernelInfo); 192 } 193 } else { 194 EmitProgramInfoR600(MF); 195 } 196 197 DisasmLines.clear(); 198 HexLines.clear(); 199 DisasmLineMaxLen = 0; 200 201 EmitFunctionBody(); 202 203 if (isVerbose()) { 204 MCSectionELF *CommentSection = 205 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); 206 OutStreamer->SwitchSection(CommentSection); 207 208 if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { 209 OutStreamer->emitRawComment(" Kernel info:", false); 210 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen), 211 false); 212 OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), 213 false); 214 OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), 215 false); 216 OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode), 217 false); 218 OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), 219 false); 220 OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), 221 false); 222 OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) + 223 " bytes/workgroup (compile time only)", false); 224 225 OutStreamer->emitRawComment(" SGPRBlocks: " + 226 Twine(KernelInfo.SGPRBlocks), false); 227 OutStreamer->emitRawComment(" VGPRBlocks: " + 228 Twine(KernelInfo.VGPRBlocks), false); 229 230 OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " + 231 
Twine(KernelInfo.NumSGPRsForWavesPerEU), false); 232 OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " + 233 Twine(KernelInfo.NumVGPRsForWavesPerEU), false); 234 235 OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst), 236 false); 237 OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount), 238 false); 239 240 if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) { 241 OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" + 242 Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false); 243 OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" + 244 Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false); 245 } 246 247 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + 248 Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)), 249 false); 250 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " + 251 Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)), 252 false); 253 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " + 254 Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)), 255 false); 256 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " + 257 Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)), 258 false); 259 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + 260 Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)), 261 false); 262 263 } else { 264 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); 265 OutStreamer->emitRawComment( 266 Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize))); 267 } 268 } 269 270 if (STM.dumpCode()) { 271 272 OutStreamer->SwitchSection( 273 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); 274 275 for (size_t i = 0; i < DisasmLines.size(); ++i) { 276 std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' '); 277 Comment += " ; " + HexLines[i] + "\n"; 278 279 
OutStreamer->EmitBytes(StringRef(DisasmLines[i])); 280 OutStreamer->EmitBytes(StringRef(Comment)); 281 } 282 } 283 284 return false; 285 } 286 287 void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { 288 unsigned MaxGPR = 0; 289 bool killPixel = false; 290 const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>(); 291 const R600RegisterInfo *RI = STM.getRegisterInfo(); 292 const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); 293 294 for (const MachineBasicBlock &MBB : MF) { 295 for (const MachineInstr &MI : MBB) { 296 if (MI.getOpcode() == AMDGPU::KILLGT) 297 killPixel = true; 298 unsigned numOperands = MI.getNumOperands(); 299 for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { 300 const MachineOperand &MO = MI.getOperand(op_idx); 301 if (!MO.isReg()) 302 continue; 303 unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff; 304 305 // Register with value > 127 aren't GPR 306 if (HWReg > 127) 307 continue; 308 MaxGPR = std::max(MaxGPR, HWReg); 309 } 310 } 311 } 312 313 unsigned RsrcReg; 314 if (STM.getGeneration() >= R600Subtarget::EVERGREEN) { 315 // Evergreen / Northern Islands 316 switch (MF.getFunction()->getCallingConv()) { 317 default: LLVM_FALLTHROUGH; 318 case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; 319 case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; 320 case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; 321 case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; 322 } 323 } else { 324 // R600 / R700 325 switch (MF.getFunction()->getCallingConv()) { 326 default: LLVM_FALLTHROUGH; 327 case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH; 328 case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH; 329 case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; 330 case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; 331 } 332 } 333 334 OutStreamer->EmitIntValue(RsrcReg, 4); 335 
OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) | 336 S_STACK_SIZE(MFI->CFStackSize), 4); 337 OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); 338 OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); 339 340 if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { 341 OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); 342 OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4); 343 } 344 } 345 346 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, 347 const MachineFunction &MF) const { 348 const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); 349 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 350 uint64_t CodeSize = 0; 351 unsigned MaxSGPR = 0; 352 unsigned MaxVGPR = 0; 353 bool VCCUsed = false; 354 bool FlatUsed = false; 355 const SIRegisterInfo *RI = STM.getRegisterInfo(); 356 const SIInstrInfo *TII = STM.getInstrInfo(); 357 358 for (const MachineBasicBlock &MBB : MF) { 359 for (const MachineInstr &MI : MBB) { 360 // TODO: CodeSize should account for multiple functions. 361 362 // TODO: Should we count size of debug info? 
363 if (MI.isDebugValue()) 364 continue; 365 366 if (isVerbose()) 367 CodeSize += TII->getInstSizeInBytes(MI); 368 369 unsigned numOperands = MI.getNumOperands(); 370 for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { 371 const MachineOperand &MO = MI.getOperand(op_idx); 372 unsigned width = 0; 373 bool isSGPR = false; 374 375 if (!MO.isReg()) 376 continue; 377 378 unsigned reg = MO.getReg(); 379 switch (reg) { 380 case AMDGPU::EXEC: 381 case AMDGPU::EXEC_LO: 382 case AMDGPU::EXEC_HI: 383 case AMDGPU::SCC: 384 case AMDGPU::M0: 385 continue; 386 387 case AMDGPU::VCC: 388 case AMDGPU::VCC_LO: 389 case AMDGPU::VCC_HI: 390 VCCUsed = true; 391 continue; 392 393 case AMDGPU::FLAT_SCR: 394 case AMDGPU::FLAT_SCR_LO: 395 case AMDGPU::FLAT_SCR_HI: 396 // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat 397 // instructions aren't used to access the scratch buffer. 398 if (MFI->hasFlatScratchInit()) 399 FlatUsed = true; 400 continue; 401 402 case AMDGPU::TBA: 403 case AMDGPU::TBA_LO: 404 case AMDGPU::TBA_HI: 405 case AMDGPU::TMA: 406 case AMDGPU::TMA_LO: 407 case AMDGPU::TMA_HI: 408 llvm_unreachable("trap handler registers should not be used"); 409 410 default: 411 break; 412 } 413 414 if (AMDGPU::SReg_32RegClass.contains(reg)) { 415 assert(!AMDGPU::TTMP_32RegClass.contains(reg) && 416 "trap handler registers should not be used"); 417 isSGPR = true; 418 width = 1; 419 } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { 420 isSGPR = false; 421 width = 1; 422 } else if (AMDGPU::SReg_64RegClass.contains(reg)) { 423 assert(!AMDGPU::TTMP_64RegClass.contains(reg) && 424 "trap handler registers should not be used"); 425 isSGPR = true; 426 width = 2; 427 } else if (AMDGPU::VReg_64RegClass.contains(reg)) { 428 isSGPR = false; 429 width = 2; 430 } else if (AMDGPU::VReg_96RegClass.contains(reg)) { 431 isSGPR = false; 432 width = 3; 433 } else if (AMDGPU::SReg_128RegClass.contains(reg)) { 434 isSGPR = true; 435 width = 4; 436 } else if 
(AMDGPU::VReg_128RegClass.contains(reg)) { 437 isSGPR = false; 438 width = 4; 439 } else if (AMDGPU::SReg_256RegClass.contains(reg)) { 440 isSGPR = true; 441 width = 8; 442 } else if (AMDGPU::VReg_256RegClass.contains(reg)) { 443 isSGPR = false; 444 width = 8; 445 } else if (AMDGPU::SReg_512RegClass.contains(reg)) { 446 isSGPR = true; 447 width = 16; 448 } else if (AMDGPU::VReg_512RegClass.contains(reg)) { 449 isSGPR = false; 450 width = 16; 451 } else { 452 llvm_unreachable("Unknown register class"); 453 } 454 unsigned hwReg = RI->getEncodingValue(reg) & 0xff; 455 unsigned maxUsed = hwReg + width - 1; 456 if (isSGPR) { 457 MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; 458 } else { 459 MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR; 460 } 461 } 462 } 463 } 464 465 unsigned ExtraSGPRs = 0; 466 467 if (VCCUsed) 468 ExtraSGPRs = 2; 469 470 if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) { 471 if (FlatUsed) 472 ExtraSGPRs = 4; 473 } else { 474 if (STM.isXNACKEnabled()) 475 ExtraSGPRs = 4; 476 477 if (FlatUsed) 478 ExtraSGPRs = 6; 479 } 480 481 // Record first reserved register and reserved register count fields, and 482 // update max register counts if "amdgpu-debugger-reserve-regs" attribute was 483 // requested. 484 ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0; 485 ProgInfo.ReservedVGPRCount = RI->getNumDebuggerReservedVGPRs(STM); 486 487 // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and 488 // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" 489 // attribute was requested. 490 if (STM.debuggerEmitPrologue()) { 491 ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR = 492 RI->getHWRegIndex(MFI->getScratchWaveOffsetReg()); 493 ProgInfo.DebuggerPrivateSegmentBufferSGPR = 494 RI->getHWRegIndex(MFI->getScratchRSrcReg()); 495 } 496 497 // Check the addressable register limit before we add ExtraSGPRs. 
498 if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 499 !STM.hasSGPRInitBug()) { 500 unsigned MaxAddressableNumSGPRs = STM.getMaxNumSGPRs(); 501 if (MaxSGPR + 1 > MaxAddressableNumSGPRs) { 502 // This can happen due to a compiler bug or when using inline asm. 503 LLVMContext &Ctx = MF.getFunction()->getContext(); 504 DiagnosticInfoResourceLimit Diag(*MF.getFunction(), 505 "addressable scalar registers", 506 MaxSGPR + 1, DS_Error, 507 DK_ResourceLimit, MaxAddressableNumSGPRs); 508 Ctx.diagnose(Diag); 509 MaxSGPR = MaxAddressableNumSGPRs - 1; 510 } 511 } 512 513 // Account for extra SGPRs and VGPRs reserved for debugger use. 514 MaxSGPR += ExtraSGPRs; 515 MaxVGPR += RI->getNumDebuggerReservedVGPRs(STM); 516 517 // We found the maximum register index. They start at 0, so add one to get the 518 // number of registers. 519 ProgInfo.NumVGPR = MaxVGPR + 1; 520 ProgInfo.NumSGPR = MaxSGPR + 1; 521 522 // Adjust number of registers used to meet default/requested minimum/maximum 523 // number of waves per execution unit request. 524 ProgInfo.NumSGPRsForWavesPerEU = std::max( 525 ProgInfo.NumSGPR, RI->getMinNumSGPRs(STM, MFI->getMaxWavesPerEU())); 526 ProgInfo.NumVGPRsForWavesPerEU = std::max( 527 ProgInfo.NumVGPR, RI->getMinNumVGPRs(MFI->getMaxWavesPerEU())); 528 529 if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || 530 STM.hasSGPRInitBug()) { 531 unsigned MaxNumSGPRs = STM.getMaxNumSGPRs(); 532 if (ProgInfo.NumSGPR > MaxNumSGPRs) { 533 // This can happen due to a compiler bug or when using inline asm to use the 534 // registers which are usually reserved for vcc etc. 
535 536 LLVMContext &Ctx = MF.getFunction()->getContext(); 537 DiagnosticInfoResourceLimit Diag(*MF.getFunction(), 538 "scalar registers", 539 ProgInfo.NumSGPR, DS_Error, 540 DK_ResourceLimit, MaxNumSGPRs); 541 Ctx.diagnose(Diag); 542 ProgInfo.NumSGPR = MaxNumSGPRs; 543 ProgInfo.NumSGPRsForWavesPerEU = MaxNumSGPRs; 544 } 545 } 546 547 if (STM.hasSGPRInitBug()) { 548 ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; 549 ProgInfo.NumSGPRsForWavesPerEU = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; 550 } 551 552 if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) { 553 LLVMContext &Ctx = MF.getFunction()->getContext(); 554 DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs", 555 MFI->NumUserSGPRs, DS_Error); 556 Ctx.diagnose(Diag); 557 } 558 559 if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) { 560 LLVMContext &Ctx = MF.getFunction()->getContext(); 561 DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory", 562 MFI->getLDSSize(), DS_Error); 563 Ctx.diagnose(Diag); 564 } 565 566 // SGPRBlocks is actual number of SGPR blocks minus 1. 567 ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU, 568 RI->getSGPRAllocGranule()); 569 ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / RI->getSGPRAllocGranule() - 1; 570 571 // VGPRBlocks is actual number of VGPR blocks minus 1. 572 ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU, 573 RI->getVGPRAllocGranule()); 574 ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / RI->getVGPRAllocGranule() - 1; 575 576 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode 577 // register. 578 ProgInfo.FloatMode = getFPMode(MF); 579 580 ProgInfo.IEEEMode = STM.enableIEEEBit(MF); 581 582 // Make clamp modifier on NaN input returns 0. 
583 ProgInfo.DX10Clamp = 1; 584 585 const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 586 ProgInfo.ScratchSize = FrameInfo.getStackSize(); 587 588 ProgInfo.FlatUsed = FlatUsed; 589 ProgInfo.VCCUsed = VCCUsed; 590 ProgInfo.CodeLen = CodeSize; 591 592 unsigned LDSAlignShift; 593 if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) { 594 // LDS is allocated in 64 dword blocks. 595 LDSAlignShift = 8; 596 } else { 597 // LDS is allocated in 128 dword blocks. 598 LDSAlignShift = 9; 599 } 600 601 unsigned LDSSpillSize = 602 MFI->LDSWaveSpillSize * MFI->getMaxFlatWorkGroupSize(); 603 604 ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize; 605 ProgInfo.LDSBlocks = 606 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift; 607 608 // Scratch is allocated in 256 dword blocks. 609 unsigned ScratchAlignShift = 10; 610 // We need to program the hardware with the amount of scratch memory that 611 // is used by the entire wave. ProgInfo.ScratchSize is the amount of 612 // scratch memory used per thread. 
613 ProgInfo.ScratchBlocks = 614 alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(), 615 1ULL << ScratchAlignShift) >> 616 ScratchAlignShift; 617 618 ProgInfo.ComputePGMRSrc1 = 619 S_00B848_VGPRS(ProgInfo.VGPRBlocks) | 620 S_00B848_SGPRS(ProgInfo.SGPRBlocks) | 621 S_00B848_PRIORITY(ProgInfo.Priority) | 622 S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | 623 S_00B848_PRIV(ProgInfo.Priv) | 624 S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | 625 S_00B848_DEBUG_MODE(ProgInfo.DebugMode) | 626 S_00B848_IEEE_MODE(ProgInfo.IEEEMode); 627 628 // 0 = X, 1 = XY, 2 = XYZ 629 unsigned TIDIGCompCnt = 0; 630 if (MFI->hasWorkItemIDZ()) 631 TIDIGCompCnt = 2; 632 else if (MFI->hasWorkItemIDY()) 633 TIDIGCompCnt = 1; 634 635 ProgInfo.ComputePGMRSrc2 = 636 S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | 637 S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | 638 S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) | 639 S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) | 640 S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) | 641 S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) | 642 S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) | 643 S_00B84C_EXCP_EN_MSB(0) | 644 S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) | 645 S_00B84C_EXCP_EN(0); 646 } 647 648 static unsigned getRsrcReg(CallingConv::ID CallConv) { 649 switch (CallConv) { 650 default: LLVM_FALLTHROUGH; 651 case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1; 652 case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; 653 case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; 654 case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; 655 } 656 } 657 658 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, 659 const SIProgramInfo &KernelInfo) { 660 const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); 661 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 662 unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv()); 663 664 if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { 665 
OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); 666 667 OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4); 668 669 OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); 670 OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4); 671 672 OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); 673 OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4); 674 675 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = 676 // 0" comment but I don't see a corresponding field in the register spec. 677 } else { 678 OutStreamer->EmitIntValue(RsrcReg, 4); 679 OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) | 680 S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4); 681 if (STM.isVGPRSpillingEnabled(*MF.getFunction())) { 682 OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); 683 OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4); 684 } 685 } 686 687 if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) { 688 OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); 689 OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); 690 OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); 691 OutStreamer->EmitIntValue(MFI->PSInputEna, 4); 692 OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); 693 OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); 694 } 695 696 OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4); 697 OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4); 698 OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4); 699 OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4); 700 } 701 702 // This is supposed to be log2(Size) 703 static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { 704 switch (Size) { 705 case 4: 706 return AMD_ELEMENT_4_BYTES; 707 case 8: 708 return AMD_ELEMENT_8_BYTES; 709 case 16: 710 return AMD_ELEMENT_16_BYTES; 711 default: 712 llvm_unreachable("invalid private_element_size"); 713 } 714 } 715 
716 void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, 717 const SIProgramInfo &KernelInfo) const { 718 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 719 const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); 720 amd_kernel_code_t header; 721 722 AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits()); 723 724 header.compute_pgm_resource_registers = 725 KernelInfo.ComputePGMRSrc1 | 726 (KernelInfo.ComputePGMRSrc2 << 32); 727 header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; 728 729 730 AMD_HSA_BITS_SET(header.code_properties, 731 AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, 732 getElementByteSizeValue(STM.getMaxPrivateElementSize())); 733 734 if (MFI->hasPrivateSegmentBuffer()) { 735 header.code_properties |= 736 AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; 737 } 738 739 if (MFI->hasDispatchPtr()) 740 header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; 741 742 if (MFI->hasQueuePtr()) 743 header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; 744 745 if (MFI->hasKernargSegmentPtr()) 746 header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; 747 748 if (MFI->hasDispatchID()) 749 header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; 750 751 if (MFI->hasFlatScratchInit()) 752 header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; 753 754 // TODO: Private segment size 755 756 if (MFI->hasGridWorkgroupCountX()) { 757 header.code_properties |= 758 AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X; 759 } 760 761 if (MFI->hasGridWorkgroupCountY()) { 762 header.code_properties |= 763 AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y; 764 } 765 766 if (MFI->hasGridWorkgroupCountZ()) { 767 header.code_properties |= 768 AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z; 769 } 770 771 if (MFI->hasDispatchPtr()) 772 header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; 773 774 if (STM.debuggerSupported()) 775 
header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED; 776 777 if (STM.isXNACKEnabled()) 778 header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; 779 780 // FIXME: Should use getKernArgSize 781 header.kernarg_segment_byte_size = 782 STM.getKernArgSegmentSize(MFI->getABIArgOffset()); 783 header.wavefront_sgpr_count = KernelInfo.NumSGPR; 784 header.workitem_vgpr_count = KernelInfo.NumVGPR; 785 header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; 786 header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; 787 header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst; 788 header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount; 789 790 // These alignment values are specified in powers of two, so alignment = 791 // 2^n. The minimum alignment is 2^4 = 16. 792 header.kernarg_segment_alignment = std::max((size_t)4, 793 countTrailingZeros(MFI->getMaxKernArgAlign())); 794 795 if (STM.debuggerEmitPrologue()) { 796 header.debug_wavefront_private_segment_offset_sgpr = 797 KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; 798 header.debug_private_segment_buffer_sgpr = 799 KernelInfo.DebuggerPrivateSegmentBufferSGPR; 800 } 801 802 AMDGPUTargetStreamer *TS = 803 static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); 804 805 OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); 806 TS->EmitAMDKernelCodeT(header); 807 } 808 809 bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, 810 unsigned AsmVariant, 811 const char *ExtraCode, raw_ostream &O) { 812 if (ExtraCode && ExtraCode[0]) { 813 if (ExtraCode[1] != 0) 814 return true; // Unknown modifier. 
815 816 switch (ExtraCode[0]) { 817 default: 818 // See if this is a generic print operand 819 return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); 820 case 'r': 821 break; 822 } 823 } 824 825 AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O, 826 *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo()); 827 return false; 828 } 829