//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

using namespace llvm;

namespace {

struct OutgoingArgHandler : public CallLowering::ValueHandler {
  OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                     MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    MIB.addUse(PhysReg);
    MIRBuilder.buildCopy(PhysReg, ValVReg);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
  }
};

} // end anonymous namespace

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
  : CallLowering(&TLI) {
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
                                     const Value *Val,
                                     ArrayRef<Register> VRegs) const {
  MachineFunction &MF = MIRBuilder.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  if (!Val) {
    MIRBuilder.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
    return true;
  }

  Register VReg = VRegs[0];

  const Function &F = MF.getFunction();
  auto &DL = F.getParent()->getDataLayout();
  if (!AMDGPU::isShader(F.getCallingConv()))
    return false;

  const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
  SmallVector<EVT, 4> SplitVTs;
  SmallVector<uint64_t, 4> Offsets;
  ArgInfo OrigArg{VReg, Val->getType()};
  setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);

  SmallVector<ArgInfo, 8> SplitArgs;
  CCAssignFn *AssignFn = CCAssignFnForReturn(F.getCallingConv(), false);
  for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
    Type *SplitTy = SplitVTs[i].getTypeForEVT(F.getContext());
    SplitArgs.push_back({VRegs[i], SplitTy, OrigArg.Flags, OrigArg.IsFixed});
  }
  auto RetInstr = MIRBuilder.buildInstrNoInsert(AMDGPU::SI_RETURN_TO_EPILOG);
  OutgoingArgHandler Handler(MIRBuilder, MRI, RetInstr, AssignFn);
  if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
    return false;
  MIRBuilder.insertInstr(RetInstr);

  return true;
}

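// Build a pointer into the kernarg segment that is Offset bytes past the
// preloaded kernarg segment base pointer.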
Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
                                               Type *ParamTy,
                                               uint64_t Offset) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  LLT PtrType = getLLTForType(*PtrTy, DL);
  Register DstReg = MRI.createGenericVirtualRegister(PtrType);
  Register KernArgSegmentPtr =
      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
  MIRBuilder.buildConstant(OffsetReg, Offset);

  MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);

  return DstReg;
}

void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
                                        Type *ParamTy, uint64_t Offset,
                                        unsigned Align,
                                        Register DstReg) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
  Register PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
                                       MachineMemOperand::MONonTemporal |
                                       MachineMemOperand::MOInvariant,
                              TypeSize, Align);

  MIRBuilder.buildLoad(DstReg, PtrReg, *MMO);
}

static Register findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}

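// Allocate the fixed entry VGPRs (VGPR0, VGPR1, VGPR2) used to pass the
// workitem IDs in the X, Y and Z dimensions to a kernel, if they are needed.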
static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
                                           MachineFunction &MF,
                                           const SIRegisterInfo &TRI,
                                           SIMachineFunctionInfo &Info) {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (Info.hasWorkItemIDX()) {
    Register Reg = AMDGPU::VGPR0;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
  }

  if (Info.hasWorkItemIDY()) {
    Register Reg = AMDGPU::VGPR1;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
  }

  if (Info.hasWorkItemIDZ()) {
    Register Reg = AMDGPU::VGPR2;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
  }
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &MIRBuilder,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    MIRBuilder.getMBB().addLiveIn(InputPtrReg);
    MIRBuilder.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    unsigned DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}

static void allocateSystemSGPRs(CCState &CCInfo,
                                MachineFunction &MF,
                                SIMachineFunctionInfo &Info,
                                CallingConv::ID CallConv,
                                bool IsShader) {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (Info.hasWorkGroupIDX()) {
    Register Reg = Info.addWorkGroupIDX();
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupIDY()) {
    Register Reg = Info.addWorkGroupIDY();
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupIDZ()) {
    unsigned Reg = Info.addWorkGroupIDZ();
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasWorkGroupInfo()) {
    unsigned Reg = Info.addWorkGroupInfo();
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    if (IsShader) {
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      // This is true if the scratch wave byte offset doesn't have a fixed
      // location.
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      }
    } else
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  }
}

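// Lower the incoming formal arguments. Kernel arguments are loaded directly
// from the kernarg segment; shader calling conventions instead go through the
// normal calling-convention assignment and live-in copies below.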
bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &MIRBuilder, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  // AMDGPU_GS and AMDGPU_HS are not supported yet.
  if (F.getCallingConv() == CallingConv::AMDGPU_GS ||
      F.getCallingConv() == CallingConv::AMDGPU_HS)
    return false;

  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  bool IsShader = AMDGPU::isShader(F.getCallingConv());

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) {
    allocateHSAUserSGPRs(CCInfo, MIRBuilder, MF, *TRI, *Info);

    unsigned i = 0;
    const unsigned KernArgBaseAlign = 16;
    const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
    uint64_t ExplicitArgOffset = 0;

    // TODO: Align down to dword alignment and extract bits for extending loads.
    for (auto &Arg : F.args()) {
      Type *ArgTy = Arg.getType();
      unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
      if (AllocSize == 0)
        continue;

      unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);

      uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
      ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

      ArrayRef<Register> OrigArgRegs = VRegs[i];
      Register ArgReg =
          OrigArgRegs.size() == 1
              ? OrigArgRegs[0]
              : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
      unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
      ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
      lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, ArgReg);
      if (OrigArgRegs.size() > 1)
        unpackRegs(OrigArgRegs, ArgReg, ArgTy, MIRBuilder);
      ++i;
    }

    allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
    allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), IsShader);
    return true;
  }

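  // Everything below handles the non-kernel (shader) calling conventions:
  // allocate any remaining special SGPR inputs, run the calling convention
  // analysis, and copy arguments out of their assigned registers.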
  if (Info->hasImplicitBufferPtr()) {
    unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  unsigned NumArgs = F.arg_size();
  Function::const_arg_iterator CurOrigArg = F.arg_begin();
  const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
  unsigned PSInputNum = 0;
  BitVector Skipped(NumArgs);
  for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) {
    EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType());

    // We can only handle simple value types at the moment.
    ISD::ArgFlagsTy Flags;
    assert(VRegs[i].size() == 1 && "Can't lower into more than one register");
    ArgInfo OrigArg{VRegs[i][0], CurOrigArg->getType()};
    setArgFlags(OrigArg, i + 1, DL, F);
    Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType()));

    if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
        !OrigArg.Flags.isInReg() && !OrigArg.Flags.isByVal() &&
        PSInputNum <= 15) {
      if (CurOrigArg->use_empty() && !Info->isPSInputAllocated(PSInputNum)) {
        Skipped.set(i);
        ++PSInputNum;
        continue;
      }

      Info->markPSInputAllocated(PSInputNum);
      if (!CurOrigArg->use_empty())
        Info->markPSInputEnabled(PSInputNum);

      ++PSInputNum;
    }

    CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(),
                                             /*IsVarArg=*/false);

    if (ValEVT.isVector()) {
      EVT ElemVT = ValEVT.getVectorElementType();
      if (!ValEVT.isSimple())
        return false;
      MVT ValVT = ElemVT.getSimpleVT();
      bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full,
                          OrigArg.Flags, CCInfo);
      if (!Res)
        return false;
    } else {
      MVT ValVT = ValEVT.getSimpleVT();
      if (!ValEVT.isSimple())
        return false;
      bool Res =
          AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo);

      // Fail if we don't know how to handle this type.
      if (Res)
        return false;
    }
  }

  Function::const_arg_iterator Arg = F.arg_begin();

  if (F.getCallingConv() == CallingConv::AMDGPU_VS ||
      F.getCallingConv() == CallingConv::AMDGPU_PS) {
    for (unsigned i = 0, OrigArgIdx = 0;
         OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) {
      if (Skipped.test(OrigArgIdx))
        continue;
      assert(VRegs[OrigArgIdx].size() == 1 &&
             "Can't lower into more than 1 reg");
      CCValAssign &VA = ArgLocs[i++];
      MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx][0]);
      MIRBuilder.getMBB().addLiveIn(VA.getLocReg());
      MIRBuilder.buildCopy(VRegs[OrigArgIdx][0], VA.getLocReg());
    }

    allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), IsShader);
    return true;
  }

  return false;
}