//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
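// For example, v3s16 (48 bits) is rounded up to v4s16 (64 bits) and v5s8
// (40 bits) to v8s8 (64 bits), per the formula below.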
81 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 82 return [=](const LegalityQuery &Query) { 83 const LLT Ty = Query.Types[TypeIdx]; 84 85 const LLT EltTy = Ty.getElementType(); 86 const int Size = Ty.getSizeInBits(); 87 const int EltSize = EltTy.getSizeInBits(); 88 const int NextMul32 = (Size + 31) / 32; 89 90 assert(EltSize < 32); 91 92 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 93 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 94 }; 95 } 96 97 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 98 return [=](const LegalityQuery &Query) { 99 const LLT QueryTy = Query.Types[TypeIdx]; 100 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 101 }; 102 } 103 104 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 105 return [=](const LegalityQuery &Query) { 106 const LLT QueryTy = Query.Types[TypeIdx]; 107 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 108 }; 109 } 110 111 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 112 return [=](const LegalityQuery &Query) { 113 const LLT QueryTy = Query.Types[TypeIdx]; 114 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 115 }; 116 } 117 118 // Any combination of 32 or 64-bit elements up to 512 bits, and multiples of 119 // v2s16. 120 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 121 return [=](const LegalityQuery &Query) { 122 const LLT Ty = Query.Types[TypeIdx]; 123 if (Ty.isVector()) { 124 const int EltSize = Ty.getElementType().getSizeInBits(); 125 return EltSize == 32 || EltSize == 64 || 126 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 127 EltSize == 128 || EltSize == 256; 128 } 129 130 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512; 131 }; 132 } 133 134 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 135 return [=](const LegalityQuery &Query) { 136 return Query.Types[TypeIdx].getElementType() == Type; 137 }; 138 } 139 140 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 141 return [=](const LegalityQuery &Query) { 142 const LLT Ty = Query.Types[TypeIdx]; 143 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 144 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 145 }; 146 } 147 148 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 149 const GCNTargetMachine &TM) 150 : ST(ST_) { 151 using namespace TargetOpcode; 152 153 auto GetAddrSpacePtr = [&TM](unsigned AS) { 154 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 155 }; 156 157 const LLT S1 = LLT::scalar(1); 158 const LLT S8 = LLT::scalar(8); 159 const LLT S16 = LLT::scalar(16); 160 const LLT S32 = LLT::scalar(32); 161 const LLT S64 = LLT::scalar(64); 162 const LLT S96 = LLT::scalar(96); 163 const LLT S128 = LLT::scalar(128); 164 const LLT S256 = LLT::scalar(256); 165 const LLT S512 = LLT::scalar(512); 166 167 const LLT V2S16 = LLT::vector(2, 16); 168 const LLT V4S16 = LLT::vector(4, 16); 169 170 const LLT V2S32 = LLT::vector(2, 32); 171 const LLT V3S32 = LLT::vector(3, 32); 172 const LLT V4S32 = LLT::vector(4, 32); 173 const LLT V5S32 = LLT::vector(5, 32); 174 const LLT V6S32 = LLT::vector(6, 32); 175 const LLT V7S32 = LLT::vector(7, 32); 176 const LLT V8S32 = LLT::vector(8, 32); 177 const LLT V9S32 = LLT::vector(9, 32); 178 const LLT V10S32 = LLT::vector(10, 32); 179 const LLT V11S32 = LLT::vector(11, 32); 180 const LLT V12S32 = LLT::vector(12, 32); 181 const LLT V13S32 = LLT::vector(13, 32); 182 const LLT V14S32 = LLT::vector(14, 
32); 183 const LLT V15S32 = LLT::vector(15, 32); 184 const LLT V16S32 = LLT::vector(16, 32); 185 186 const LLT V2S64 = LLT::vector(2, 64); 187 const LLT V3S64 = LLT::vector(3, 64); 188 const LLT V4S64 = LLT::vector(4, 64); 189 const LLT V5S64 = LLT::vector(5, 64); 190 const LLT V6S64 = LLT::vector(6, 64); 191 const LLT V7S64 = LLT::vector(7, 64); 192 const LLT V8S64 = LLT::vector(8, 64); 193 194 std::initializer_list<LLT> AllS32Vectors = 195 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 196 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32}; 197 std::initializer_list<LLT> AllS64Vectors = 198 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64}; 199 200 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 201 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 202 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 203 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 204 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 205 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 206 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 207 208 const LLT CodePtr = FlatPtr; 209 210 const std::initializer_list<LLT> AddrSpaces64 = { 211 GlobalPtr, ConstantPtr, FlatPtr 212 }; 213 214 const std::initializer_list<LLT> AddrSpaces32 = { 215 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 216 }; 217 218 const std::initializer_list<LLT> FPTypesBase = { 219 S32, S64 220 }; 221 222 const std::initializer_list<LLT> FPTypes16 = { 223 S32, S64, S16 224 }; 225 226 const std::initializer_list<LLT> FPTypesPK16 = { 227 S32, S64, S16, V2S16 228 }; 229 230 setAction({G_BRCOND, S1}, Legal); 231 232 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 233 // elements for v3s16 234 getActionDefinitionsBuilder(G_PHI) 235 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 236 .legalFor(AllS32Vectors) 237 .legalFor(AllS64Vectors) 238 .legalFor(AddrSpaces64) 239 .legalFor(AddrSpaces32) 240 .clampScalar(0, S32, S256) 241 .widenScalarToNextPow2(0, 32) 242 .clampMaxNumElements(0, S32, 16) 243 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 244 .legalIf(isPointer(0)); 245 246 if (ST.has16BitInsts()) { 247 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 248 .legalFor({S32, S16}) 249 .clampScalar(0, S16, S32) 250 .scalarize(0); 251 } else { 252 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 253 .legalFor({S32}) 254 .clampScalar(0, S32, S32) 255 .scalarize(0); 256 } 257 258 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 259 .legalFor({S32}) 260 .clampScalar(0, S32, S32) 261 .scalarize(0); 262 263 // Report legal for any types we can handle anywhere. For the cases only legal 264 // on the SALU, RegBankSelect will be able to re-legalize. 265 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 266 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 267 .clampScalar(0, S32, S64) 268 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 269 .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0)) 270 .widenScalarToNextPow2(0) 271 .scalarize(0); 272 273 getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO, 274 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 275 .legalFor({{S32, S1}}) 276 .clampScalar(0, S32, S32); 277 278 getActionDefinitionsBuilder(G_BITCAST) 279 .legalForCartesianProduct({S32, V2S16}) 280 .legalForCartesianProduct({S64, V2S32, V4S16}) 281 .legalForCartesianProduct({V2S64, V4S32}) 282 // Don't worry about the size constraint. 
283 .legalIf(all(isPointer(0), isPointer(1))) 284 // FIXME: Testing hack 285 .legalForCartesianProduct({S16, LLT::vector(2, 8), }); 286 287 getActionDefinitionsBuilder(G_FCONSTANT) 288 .legalFor({S32, S64, S16}) 289 .clampScalar(0, S16, S64); 290 291 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 292 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 293 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 294 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 295 .clampScalarOrElt(0, S32, S512) 296 .legalIf(isMultiple32(0)) 297 .widenScalarToNextPow2(0, 32) 298 .clampMaxNumElements(0, S32, 16); 299 300 301 // FIXME: i1 operands to intrinsics should always be legal, but other i1 302 // values may not be legal. We need to figure out how to distinguish 303 // between these two scenarios. 304 getActionDefinitionsBuilder(G_CONSTANT) 305 .legalFor({S1, S32, S64, S16, GlobalPtr, 306 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 307 .clampScalar(0, S32, S64) 308 .widenScalarToNextPow2(0) 309 .legalIf(isPointer(0)); 310 311 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 312 getActionDefinitionsBuilder(G_GLOBAL_VALUE).customFor({LocalPtr}); 313 314 315 auto &FPOpActions = getActionDefinitionsBuilder( 316 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 317 .legalFor({S32, S64}); 318 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 319 .customFor({S32, S64}); 320 321 if (ST.has16BitInsts()) { 322 if (ST.hasVOP3PInsts()) 323 FPOpActions.legalFor({S16, V2S16}); 324 else 325 FPOpActions.legalFor({S16}); 326 327 TrigActions.customFor({S16}); 328 } 329 330 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 331 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 332 333 if (ST.hasVOP3PInsts()) { 334 MinNumMaxNum.customFor(FPTypesPK16) 335 .clampMaxNumElements(0, S16, 2) 336 .clampScalar(0, S16, S64) 337 .scalarize(0); 338 } else if (ST.has16BitInsts()) { 339 MinNumMaxNum.customFor(FPTypes16) 340 .clampScalar(0, S16, S64) 341 .scalarize(0); 342 } else { 343 MinNumMaxNum.customFor(FPTypesBase) 344 .clampScalar(0, S32, S64) 345 .scalarize(0); 346 } 347 348 if (ST.hasVOP3PInsts()) 349 FPOpActions.clampMaxNumElements(0, S16, 2); 350 351 FPOpActions 352 .scalarize(0) 353 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 354 355 TrigActions 356 .scalarize(0) 357 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 358 359 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 360 .legalFor(FPTypesPK16) 361 .clampMaxNumElements(0, S16, 2) 362 .scalarize(0) 363 .clampScalar(0, S16, S64); 364 365 // TODO: Implement 366 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); 367 368 if (ST.has16BitInsts()) { 369 getActionDefinitionsBuilder(G_FSQRT) 370 .legalFor({S32, S64, S16}) 371 .scalarize(0) 372 .clampScalar(0, S16, S64); 373 } else { 374 getActionDefinitionsBuilder(G_FSQRT) 375 .legalFor({S32, S64}) 376 .scalarize(0) 377 .clampScalar(0, S32, S64); 378 } 379 380 getActionDefinitionsBuilder(G_FPTRUNC) 381 .legalFor({{S32, S64}, {S16, S32}}) 382 .scalarize(0); 383 384 getActionDefinitionsBuilder(G_FPEXT) 385 .legalFor({{S64, S32}, {S32, S16}}) 386 .lowerFor({{S64, S16}}) // FIXME: Implement 387 .scalarize(0); 388 389 // TODO: Verify V_BFI_B32 is generated from expanded bit ops. 
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S96, S32},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
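  // e.g. G_CTPOP of an s64 source is legal here with an s32 result; the
  // clamps below narrow wider sources to s64 and force all results to s32.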
475 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, 476 G_CTTZ, G_CTTZ_ZERO_UNDEF, 477 G_CTPOP}) 478 .legalFor({{S32, S32}, {S32, S64}}) 479 .clampScalar(0, S32, S32) 480 .clampScalar(1, S32, S64) 481 .scalarize(0) 482 .widenScalarToNextPow2(0, 32) 483 .widenScalarToNextPow2(1, 32); 484 485 // TODO: Expand for > s32 486 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) 487 .legalFor({S32}) 488 .clampScalar(0, S32, S32) 489 .scalarize(0); 490 491 if (ST.has16BitInsts()) { 492 if (ST.hasVOP3PInsts()) { 493 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 494 .legalFor({S32, S16, V2S16}) 495 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 496 .clampMaxNumElements(0, S16, 2) 497 .clampScalar(0, S16, S32) 498 .widenScalarToNextPow2(0) 499 .scalarize(0); 500 } else { 501 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 502 .legalFor({S32, S16}) 503 .widenScalarToNextPow2(0) 504 .clampScalar(0, S16, S32) 505 .scalarize(0); 506 } 507 } else { 508 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 509 .legalFor({S32}) 510 .clampScalar(0, S32, S32) 511 .widenScalarToNextPow2(0) 512 .scalarize(0); 513 } 514 515 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 516 return [=](const LegalityQuery &Query) { 517 return Query.Types[TypeIdx0].getSizeInBits() < 518 Query.Types[TypeIdx1].getSizeInBits(); 519 }; 520 }; 521 522 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 523 return [=](const LegalityQuery &Query) { 524 return Query.Types[TypeIdx0].getSizeInBits() > 525 Query.Types[TypeIdx1].getSizeInBits(); 526 }; 527 }; 528 529 getActionDefinitionsBuilder(G_INTTOPTR) 530 // List the common cases 531 .legalForCartesianProduct(AddrSpaces64, {S64}) 532 .legalForCartesianProduct(AddrSpaces32, {S32}) 533 .scalarize(0) 534 // Accept any address space as long as the size matches 535 .legalIf(sameSize(0, 1)) 536 .widenScalarIf(smallerThan(1, 0), 537 [](const LegalityQuery &Query) { 538 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 539 }) 540 .narrowScalarIf(greaterThan(1, 0), 541 [](const LegalityQuery &Query) { 542 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 543 }); 544 545 getActionDefinitionsBuilder(G_PTRTOINT) 546 // List the common cases 547 .legalForCartesianProduct(AddrSpaces64, {S64}) 548 .legalForCartesianProduct(AddrSpaces32, {S32}) 549 .scalarize(0) 550 // Accept any address space as long as the size matches 551 .legalIf(sameSize(0, 1)) 552 .widenScalarIf(smallerThan(0, 1), 553 [](const LegalityQuery &Query) { 554 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 555 }) 556 .narrowScalarIf( 557 greaterThan(0, 1), 558 [](const LegalityQuery &Query) { 559 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 560 }); 561 562 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 563 .scalarize(0) 564 .custom(); 565 566 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 567 // handle some operations by just promoting the register during 568 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 569 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { 570 switch (AS) { 571 // FIXME: Private element size. 572 case AMDGPUAS::PRIVATE_ADDRESS: 573 return 32; 574 // FIXME: Check subtarget 575 case AMDGPUAS::LOCAL_ADDRESS: 576 return ST.useDS128() ? 128 : 64; 577 578 // Treat constant and global as identical. 
SMRD loads are sometimes usable 579 // for global loads (ideally constant address space should be eliminated) 580 // depending on the context. Legality cannot be context dependent, but 581 // RegBankSelect can split the load as necessary depending on the pointer 582 // register bank/uniformity and if the memory is invariant or not written in 583 // a kernel. 584 case AMDGPUAS::CONSTANT_ADDRESS: 585 case AMDGPUAS::GLOBAL_ADDRESS: 586 return 512; 587 default: 588 return 128; 589 } 590 }; 591 592 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { 593 const LLT DstTy = Query.Types[0]; 594 595 // Split vector extloads. 596 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 597 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 598 return true; 599 600 const LLT PtrTy = Query.Types[1]; 601 unsigned AS = PtrTy.getAddressSpace(); 602 if (MemSize > maxSizeForAddrSpace(AS)) 603 return true; 604 605 // Catch weird sized loads that don't evenly divide into the access sizes 606 // TODO: May be able to widen depending on alignment etc. 607 unsigned NumRegs = MemSize / 32; 608 if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) 609 return true; 610 611 unsigned Align = Query.MMODescrs[0].AlignInBits; 612 if (Align < MemSize) { 613 const SITargetLowering *TLI = ST.getTargetLowering(); 614 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 615 } 616 617 return false; 618 }; 619 620 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 621 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 622 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 623 624 // TODO: Refine based on subtargets which support unaligned access or 128-bit 625 // LDS 626 // TODO: Unsupported flat for SI. 627 628 for (unsigned Op : {G_LOAD, G_STORE}) { 629 const bool IsStore = Op == G_STORE; 630 631 auto &Actions = getActionDefinitionsBuilder(Op); 632 // Whitelist the common cases. 
633 // TODO: Pointer loads 634 // TODO: Wide constant loads 635 // TODO: Only CI+ has 3x loads 636 // TODO: Loads to s16 on gfx9 637 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 638 {V2S32, GlobalPtr, 64, GlobalAlign32}, 639 {V3S32, GlobalPtr, 96, GlobalAlign32}, 640 {S96, GlobalPtr, 96, GlobalAlign32}, 641 {V4S32, GlobalPtr, 128, GlobalAlign32}, 642 {S128, GlobalPtr, 128, GlobalAlign32}, 643 {S64, GlobalPtr, 64, GlobalAlign32}, 644 {V2S64, GlobalPtr, 128, GlobalAlign32}, 645 {V2S16, GlobalPtr, 32, GlobalAlign32}, 646 {S32, GlobalPtr, 8, GlobalAlign8}, 647 {S32, GlobalPtr, 16, GlobalAlign16}, 648 649 {S32, LocalPtr, 32, 32}, 650 {S64, LocalPtr, 64, 32}, 651 {V2S32, LocalPtr, 64, 32}, 652 {S32, LocalPtr, 8, 8}, 653 {S32, LocalPtr, 16, 16}, 654 {V2S16, LocalPtr, 32, 32}, 655 656 {S32, PrivatePtr, 32, 32}, 657 {S32, PrivatePtr, 8, 8}, 658 {S32, PrivatePtr, 16, 16}, 659 {V2S16, PrivatePtr, 32, 32}, 660 661 {S32, FlatPtr, 32, GlobalAlign32}, 662 {S32, FlatPtr, 16, GlobalAlign16}, 663 {S32, FlatPtr, 8, GlobalAlign8}, 664 {V2S16, FlatPtr, 32, GlobalAlign32}, 665 666 {S32, ConstantPtr, 32, GlobalAlign32}, 667 {V2S32, ConstantPtr, 64, GlobalAlign32}, 668 {V3S32, ConstantPtr, 96, GlobalAlign32}, 669 {V4S32, ConstantPtr, 128, GlobalAlign32}, 670 {S64, ConstantPtr, 64, GlobalAlign32}, 671 {S128, ConstantPtr, 128, GlobalAlign32}, 672 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 673 Actions 674 .customIf(typeIs(1, Constant32Ptr)) 675 .narrowScalarIf( 676 [=](const LegalityQuery &Query) -> bool { 677 return !Query.Types[0].isVector() && needToSplitLoad(Query); 678 }, 679 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 680 const LLT DstTy = Query.Types[0]; 681 const LLT PtrTy = Query.Types[1]; 682 683 const unsigned DstSize = DstTy.getSizeInBits(); 684 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 685 686 // Split extloads. 687 if (DstSize > MemSize) 688 return std::make_pair(0, LLT::scalar(MemSize)); 689 690 if (DstSize > 32 && (DstSize % 32 != 0)) { 691 // FIXME: Need a way to specify non-extload of larger size if 692 // suitably aligned. 693 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 694 } 695 696 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 697 if (MemSize > MaxSize) 698 return std::make_pair(0, LLT::scalar(MaxSize)); 699 700 unsigned Align = Query.MMODescrs[0].AlignInBits; 701 return std::make_pair(0, LLT::scalar(Align)); 702 }) 703 .fewerElementsIf( 704 [=](const LegalityQuery &Query) -> bool { 705 return Query.Types[0].isVector() && needToSplitLoad(Query); 706 }, 707 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 708 const LLT DstTy = Query.Types[0]; 709 const LLT PtrTy = Query.Types[1]; 710 711 LLT EltTy = DstTy.getElementType(); 712 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 713 714 // Split if it's too large for the address space. 715 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 716 unsigned NumElts = DstTy.getNumElements(); 717 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 718 719 // FIXME: Refine when odd breakdowns handled 720 // The scalars will need to be re-legalized. 721 if (NumPieces == 1 || NumPieces >= NumElts || 722 NumElts % NumPieces != 0) 723 return std::make_pair(0, EltTy); 724 725 return std::make_pair(0, 726 LLT::vector(NumElts / NumPieces, EltTy)); 727 } 728 729 // Need to split because of alignment. 
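                // e.g. by the computation below, a v8s32 load with only
                // 16-bit alignment is broken into v2s32 pieces
                // (EltSize / Align elements per piece).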
730 unsigned Align = Query.MMODescrs[0].AlignInBits; 731 unsigned EltSize = EltTy.getSizeInBits(); 732 if (EltSize > Align && 733 (EltSize / Align < DstTy.getNumElements())) { 734 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 735 } 736 737 // May need relegalization for the scalars. 738 return std::make_pair(0, EltTy); 739 }) 740 .minScalar(0, S32); 741 742 if (IsStore) 743 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 744 745 // TODO: Need a bitcast lower option? 746 Actions 747 .legalIf([=](const LegalityQuery &Query) { 748 const LLT Ty0 = Query.Types[0]; 749 unsigned Size = Ty0.getSizeInBits(); 750 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 751 unsigned Align = Query.MMODescrs[0].AlignInBits; 752 753 // No extending vector loads. 754 if (Size > MemSize && Ty0.isVector()) 755 return false; 756 757 // FIXME: Widening store from alignment not valid. 758 if (MemSize < Size) 759 MemSize = std::max(MemSize, Align); 760 761 switch (MemSize) { 762 case 8: 763 case 16: 764 return Size == 32; 765 case 32: 766 case 64: 767 case 128: 768 return true; 769 case 96: 770 return ST.hasDwordx3LoadStores(); 771 case 256: 772 case 512: 773 return true; 774 default: 775 return false; 776 } 777 }) 778 .widenScalarToNextPow2(0) 779 // TODO: v3s32->v4s32 with alignment 780 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 781 } 782 783 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 784 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 785 {S32, GlobalPtr, 16, 2 * 8}, 786 {S32, LocalPtr, 8, 8}, 787 {S32, LocalPtr, 16, 16}, 788 {S32, PrivatePtr, 8, 8}, 789 {S32, PrivatePtr, 16, 16}, 790 {S32, ConstantPtr, 8, 8}, 791 {S32, ConstantPtr, 16, 2 * 8}}); 792 if (ST.hasFlatAddressSpace()) { 793 ExtLoads.legalForTypesWithMemDesc( 794 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 795 } 796 797 ExtLoads.clampScalar(0, S32, S32) 798 .widenScalarToNextPow2(0) 799 .unsupportedIfMemSizeNotPow2() 800 .lower(); 801 802 auto &Atomics = getActionDefinitionsBuilder( 803 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 804 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 805 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 806 G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG}) 807 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 808 {S64, GlobalPtr}, {S64, LocalPtr}}); 809 if (ST.hasFlatAddressSpace()) { 810 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 811 } 812 813 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 814 .legalFor({{S32, LocalPtr}}); 815 816 // TODO: Pointer types, any 32-bit or 64-bit vector 817 getActionDefinitionsBuilder(G_SELECT) 818 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 819 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 820 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1}) 821 .clampScalar(0, S16, S64) 822 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 823 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 824 .scalarize(1) 825 .clampMaxNumElements(0, S32, 2) 826 .clampMaxNumElements(0, LocalPtr, 2) 827 .clampMaxNumElements(0, PrivatePtr, 2) 828 .scalarize(0) 829 .widenScalarToNextPow2(0) 830 .legalIf(all(isPointer(0), typeIs(1, S1))); 831 832 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 833 // be more flexible with the shift amount type. 
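  // (That is 4 bits for 16-bit shifts, 5 for 32-bit, and 6 for 64-bit.)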
834 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 835 .legalFor({{S32, S32}, {S64, S32}}); 836 if (ST.has16BitInsts()) { 837 if (ST.hasVOP3PInsts()) { 838 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 839 .clampMaxNumElements(0, S16, 2); 840 } else 841 Shifts.legalFor({{S16, S32}, {S16, S16}}); 842 843 Shifts.clampScalar(1, S16, S32); 844 Shifts.clampScalar(0, S16, S64); 845 Shifts.widenScalarToNextPow2(0, 16); 846 } else { 847 // Make sure we legalize the shift amount type first, as the general 848 // expansion for the shifted type will produce much worse code if it hasn't 849 // been truncated already. 850 Shifts.clampScalar(1, S32, S32); 851 Shifts.clampScalar(0, S32, S64); 852 Shifts.widenScalarToNextPow2(0, 32); 853 } 854 Shifts.scalarize(0); 855 856 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 857 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 858 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 859 unsigned IdxTypeIdx = 2; 860 861 getActionDefinitionsBuilder(Op) 862 .customIf([=](const LegalityQuery &Query) { 863 const LLT EltTy = Query.Types[EltTypeIdx]; 864 const LLT VecTy = Query.Types[VecTypeIdx]; 865 const LLT IdxTy = Query.Types[IdxTypeIdx]; 866 return (EltTy.getSizeInBits() == 16 || 867 EltTy.getSizeInBits() % 32 == 0) && 868 VecTy.getSizeInBits() % 32 == 0 && 869 VecTy.getSizeInBits() <= 512 && 870 IdxTy.getSizeInBits() == 32; 871 }) 872 .clampScalar(EltTypeIdx, S32, S64) 873 .clampScalar(VecTypeIdx, S32, S64) 874 .clampScalar(IdxTypeIdx, S32, S32); 875 } 876 877 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 878 .unsupportedIf([=](const LegalityQuery &Query) { 879 const LLT &EltTy = Query.Types[1].getElementType(); 880 return Query.Types[0] != EltTy; 881 }); 882 883 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 884 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 885 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 886 887 // FIXME: Doesn't handle extract of illegal sizes. 
888 getActionDefinitionsBuilder(Op) 889 .legalIf([=](const LegalityQuery &Query) { 890 const LLT BigTy = Query.Types[BigTyIdx]; 891 const LLT LitTy = Query.Types[LitTyIdx]; 892 return (BigTy.getSizeInBits() % 32 == 0) && 893 (LitTy.getSizeInBits() % 16 == 0); 894 }) 895 .widenScalarIf( 896 [=](const LegalityQuery &Query) { 897 const LLT BigTy = Query.Types[BigTyIdx]; 898 return (BigTy.getScalarSizeInBits() < 16); 899 }, 900 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 901 .widenScalarIf( 902 [=](const LegalityQuery &Query) { 903 const LLT LitTy = Query.Types[LitTyIdx]; 904 return (LitTy.getScalarSizeInBits() < 16); 905 }, 906 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 907 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 908 .widenScalarToNextPow2(BigTyIdx, 32); 909 910 } 911 912 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 913 .legalForCartesianProduct(AllS32Vectors, {S32}) 914 .legalForCartesianProduct(AllS64Vectors, {S64}) 915 .clampNumElements(0, V16S32, V16S32) 916 .clampNumElements(0, V2S64, V8S64); 917 918 if (ST.hasScalarPackInsts()) 919 BuildVector.legalFor({V2S16, S32}); 920 921 BuildVector 922 .minScalarSameAs(1, 0) 923 .legalIf(isRegisterType(0)) 924 .minScalarOrElt(0, S32); 925 926 if (ST.hasScalarPackInsts()) { 927 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 928 .legalFor({V2S16, S32}) 929 .lower(); 930 } else { 931 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 932 .lower(); 933 } 934 935 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 936 .legalIf(isRegisterType(0)); 937 938 // TODO: Don't fully scalarize v2s16 pieces 939 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 940 941 // Merge/Unmerge 942 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 943 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 944 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 945 946 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 947 const LLT &Ty = Query.Types[TypeIdx]; 948 if (Ty.isVector()) { 949 const LLT &EltTy = Ty.getElementType(); 950 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) 951 return true; 952 if (!isPowerOf2_32(EltTy.getSizeInBits())) 953 return true; 954 } 955 return false; 956 }; 957 958 getActionDefinitionsBuilder(Op) 959 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 960 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 961 // worth considering the multiples of 64 since 2*192 and 2*384 are not 962 // valid. 963 .clampScalar(LitTyIdx, S16, S256) 964 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 965 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 966 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 967 elementTypeIs(1, S16)), 968 changeTo(1, V2S16)) 969 // Break up vectors with weird elements into scalars 970 .fewerElementsIf( 971 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, 972 scalarize(0)) 973 .fewerElementsIf( 974 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, 975 scalarize(1)) 976 .clampScalar(BigTyIdx, S32, S512) 977 .lowerFor({{S16, V2S16}}) 978 .widenScalarIf( 979 [=](const LegalityQuery &Query) { 980 const LLT &Ty = Query.Types[BigTyIdx]; 981 return !isPowerOf2_32(Ty.getSizeInBits()) && 982 Ty.getSizeInBits() % 16 != 0; 983 }, 984 [=](const LegalityQuery &Query) { 985 // Pick the next power of 2, or a multiple of 64 over 128. 986 // Whichever is smaller. 
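          // e.g. s65 is widened to s128, while s130 is widened to s192
          // rather than s256 (the next multiple of 64 is smaller once
          // past 128).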
987 const LLT &Ty = Query.Types[BigTyIdx]; 988 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 989 if (NewSizeInBits >= 256) { 990 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 991 if (RoundedTo < NewSizeInBits) 992 NewSizeInBits = RoundedTo; 993 } 994 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 995 }) 996 .legalIf([=](const LegalityQuery &Query) { 997 const LLT &BigTy = Query.Types[BigTyIdx]; 998 const LLT &LitTy = Query.Types[LitTyIdx]; 999 1000 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1001 return false; 1002 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1003 return false; 1004 1005 return BigTy.getSizeInBits() % 16 == 0 && 1006 LitTy.getSizeInBits() % 16 == 0 && 1007 BigTy.getSizeInBits() <= 512; 1008 }) 1009 // Any vectors left are the wrong size. Scalarize them. 1010 .scalarize(0) 1011 .scalarize(1); 1012 } 1013 1014 getActionDefinitionsBuilder(G_SEXT_INREG).lower(); 1015 1016 computeTables(); 1017 verify(*ST.getInstrInfo()); 1018 } 1019 1020 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1021 MachineRegisterInfo &MRI, 1022 MachineIRBuilder &B, 1023 GISelChangeObserver &Observer) const { 1024 switch (MI.getOpcode()) { 1025 case TargetOpcode::G_ADDRSPACE_CAST: 1026 return legalizeAddrSpaceCast(MI, MRI, B); 1027 case TargetOpcode::G_FRINT: 1028 return legalizeFrint(MI, MRI, B); 1029 case TargetOpcode::G_FCEIL: 1030 return legalizeFceil(MI, MRI, B); 1031 case TargetOpcode::G_INTRINSIC_TRUNC: 1032 return legalizeIntrinsicTrunc(MI, MRI, B); 1033 case TargetOpcode::G_SITOFP: 1034 return legalizeITOFP(MI, MRI, B, true); 1035 case TargetOpcode::G_UITOFP: 1036 return legalizeITOFP(MI, MRI, B, false); 1037 case TargetOpcode::G_FMINNUM: 1038 case TargetOpcode::G_FMAXNUM: 1039 case TargetOpcode::G_FMINNUM_IEEE: 1040 case TargetOpcode::G_FMAXNUM_IEEE: 1041 return legalizeMinNumMaxNum(MI, MRI, B); 1042 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1043 return legalizeExtractVectorElt(MI, MRI, B); 1044 case TargetOpcode::G_INSERT_VECTOR_ELT: 1045 return legalizeInsertVectorElt(MI, MRI, B); 1046 case TargetOpcode::G_FSIN: 1047 case TargetOpcode::G_FCOS: 1048 return legalizeSinCos(MI, MRI, B); 1049 case TargetOpcode::G_GLOBAL_VALUE: 1050 return legalizeGlobalValue(MI, MRI, B); 1051 case TargetOpcode::G_LOAD: 1052 return legalizeLoad(MI, MRI, B, Observer); 1053 default: 1054 return false; 1055 } 1056 1057 llvm_unreachable("expected switch to return"); 1058 } 1059 1060 Register AMDGPULegalizerInfo::getSegmentAperture( 1061 unsigned AS, 1062 MachineRegisterInfo &MRI, 1063 MachineIRBuilder &B) const { 1064 MachineFunction &MF = B.getMF(); 1065 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1066 const LLT S32 = LLT::scalar(32); 1067 1068 if (ST.hasApertureRegs()) { 1069 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1070 // getreg. 1071 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1072 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1073 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1074 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 
1075 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1076 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1077 unsigned Encoding = 1078 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1079 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1080 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1081 1082 Register ApertureReg = MRI.createGenericVirtualRegister(S32); 1083 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1084 1085 B.buildInstr(AMDGPU::S_GETREG_B32) 1086 .addDef(GetReg) 1087 .addImm(Encoding); 1088 MRI.setType(GetReg, S32); 1089 1090 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1091 B.buildInstr(TargetOpcode::G_SHL) 1092 .addDef(ApertureReg) 1093 .addUse(GetReg) 1094 .addUse(ShiftAmt.getReg(0)); 1095 1096 return ApertureReg; 1097 } 1098 1099 Register QueuePtr = MRI.createGenericVirtualRegister( 1100 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1101 1102 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1103 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1104 return Register(); 1105 1106 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1107 // private_segment_aperture_base_hi. 1108 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1109 1110 // FIXME: Don't use undef 1111 Value *V = UndefValue::get(PointerType::get( 1112 Type::getInt8Ty(MF.getFunction().getContext()), 1113 AMDGPUAS::CONSTANT_ADDRESS)); 1114 1115 MachinePointerInfo PtrInfo(V, StructOffset); 1116 MachineMemOperand *MMO = MF.getMachineMemOperand( 1117 PtrInfo, 1118 MachineMemOperand::MOLoad | 1119 MachineMemOperand::MODereferenceable | 1120 MachineMemOperand::MOInvariant, 1121 4, 1122 MinAlign(64, StructOffset)); 1123 1124 Register LoadResult = MRI.createGenericVirtualRegister(S32); 1125 Register LoadAddr; 1126 1127 B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1128 B.buildLoad(LoadResult, LoadAddr, *MMO); 1129 return LoadResult; 1130 } 1131 1132 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1133 MachineInstr &MI, MachineRegisterInfo &MRI, 1134 MachineIRBuilder &B) const { 1135 MachineFunction &MF = B.getMF(); 1136 1137 B.setInstr(MI); 1138 1139 const LLT S32 = LLT::scalar(32); 1140 Register Dst = MI.getOperand(0).getReg(); 1141 Register Src = MI.getOperand(1).getReg(); 1142 1143 LLT DstTy = MRI.getType(Dst); 1144 LLT SrcTy = MRI.getType(Src); 1145 unsigned DestAS = DstTy.getAddressSpace(); 1146 unsigned SrcAS = SrcTy.getAddressSpace(); 1147 1148 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1149 // vector element. 1150 assert(!DstTy.isVector()); 1151 1152 const AMDGPUTargetMachine &TM 1153 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1154 1155 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1156 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1157 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1158 return true; 1159 } 1160 1161 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1162 // Truncate. 1163 B.buildExtract(Dst, Src, 0); 1164 MI.eraseFromParent(); 1165 return true; 1166 } 1167 1168 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1169 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1170 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1171 1172 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1173 // another. Merge operands are required to be the same type, but creating an 1174 // extra ptrtoint would be kind of pointless. 
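    // The full 64-bit constant pointer is built by merging the 32-bit source
    // with the statically known high 32 bits of the 32-bit address space.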
1175 auto HighAddr = B.buildConstant( 1176 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1177 B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); 1178 MI.eraseFromParent(); 1179 return true; 1180 } 1181 1182 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1183 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1184 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1185 unsigned NullVal = TM.getNullPointerValue(DestAS); 1186 1187 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1188 auto FlatNull = B.buildConstant(SrcTy, 0); 1189 1190 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); 1191 1192 // Extract low 32-bits of the pointer. 1193 B.buildExtract(PtrLo32, Src, 0); 1194 1195 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1196 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); 1197 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1198 1199 MI.eraseFromParent(); 1200 return true; 1201 } 1202 1203 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1204 return false; 1205 1206 if (!ST.hasFlatAddressSpace()) 1207 return false; 1208 1209 auto SegmentNull = 1210 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1211 auto FlatNull = 1212 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1213 1214 Register ApertureReg = getSegmentAperture(DestAS, MRI, B); 1215 if (!ApertureReg.isValid()) 1216 return false; 1217 1218 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1219 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); 1220 1221 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); 1222 1223 // Coerce the type of the low half of the result so we can use merge_values. 1224 Register SrcAsInt = MRI.createGenericVirtualRegister(S32); 1225 B.buildInstr(TargetOpcode::G_PTRTOINT) 1226 .addDef(SrcAsInt) 1227 .addUse(Src); 1228 1229 // TODO: Should we allow mismatched types but matching sizes in merges to 1230 // avoid the ptrtoint? 1231 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); 1232 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); 1233 1234 MI.eraseFromParent(); 1235 return true; 1236 } 1237 1238 bool AMDGPULegalizerInfo::legalizeFrint( 1239 MachineInstr &MI, MachineRegisterInfo &MRI, 1240 MachineIRBuilder &B) const { 1241 B.setInstr(MI); 1242 1243 Register Src = MI.getOperand(1).getReg(); 1244 LLT Ty = MRI.getType(Src); 1245 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1246 1247 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1248 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1249 1250 auto C1 = B.buildFConstant(Ty, C1Val); 1251 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1252 1253 // TODO: Should this propagate fast-math-flags? 
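  // Round-via-magic-number trick: adding and then subtracting a sign-matched
  // 2^52 leaves Src rounded to an integer in the current rounding mode, since
  // doubles with magnitude >= 2^52 have no fraction bits. Sources already at
  // least that large are passed through unchanged by the select below.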
1254 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1255 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1256 1257 auto C2 = B.buildFConstant(Ty, C2Val); 1258 auto Fabs = B.buildFAbs(Ty, Src); 1259 1260 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1261 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1262 return true; 1263 } 1264 1265 bool AMDGPULegalizerInfo::legalizeFceil( 1266 MachineInstr &MI, MachineRegisterInfo &MRI, 1267 MachineIRBuilder &B) const { 1268 B.setInstr(MI); 1269 1270 const LLT S1 = LLT::scalar(1); 1271 const LLT S64 = LLT::scalar(64); 1272 1273 Register Src = MI.getOperand(1).getReg(); 1274 assert(MRI.getType(Src) == S64); 1275 1276 // result = trunc(src) 1277 // if (src > 0.0 && src != result) 1278 // result += 1.0 1279 1280 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); 1281 1282 const auto Zero = B.buildFConstant(S64, 0.0); 1283 const auto One = B.buildFConstant(S64, 1.0); 1284 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1285 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1286 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1287 auto Add = B.buildSelect(S64, And, One, Zero); 1288 1289 // TODO: Should this propagate fast-math-flags? 1290 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1291 return true; 1292 } 1293 1294 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1295 MachineIRBuilder &B) { 1296 const unsigned FractBits = 52; 1297 const unsigned ExpBits = 11; 1298 LLT S32 = LLT::scalar(32); 1299 1300 auto Const0 = B.buildConstant(S32, FractBits - 32); 1301 auto Const1 = B.buildConstant(S32, ExpBits); 1302 1303 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1304 .addUse(Const0.getReg(0)) 1305 .addUse(Const1.getReg(0)); 1306 1307 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1308 } 1309 1310 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1311 MachineInstr &MI, MachineRegisterInfo &MRI, 1312 MachineIRBuilder &B) const { 1313 B.setInstr(MI); 1314 1315 const LLT S1 = LLT::scalar(1); 1316 const LLT S32 = LLT::scalar(32); 1317 const LLT S64 = LLT::scalar(64); 1318 1319 Register Src = MI.getOperand(1).getReg(); 1320 assert(MRI.getType(Src) == S64); 1321 1322 // TODO: Should this use extract since the low half is unused? 1323 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1324 Register Hi = Unmerge.getReg(1); 1325 1326 // Extract the upper half, since this is where we will find the sign and 1327 // exponent. 1328 auto Exp = extractF64Exponent(Hi, B); 1329 1330 const unsigned FractBits = 52; 1331 1332 // Extract the sign bit. 1333 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1334 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1335 1336 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1337 1338 const auto Zero32 = B.buildConstant(S32, 0); 1339 1340 // Extend back to 64-bits. 
1341 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); 1342 1343 auto Shr = B.buildAShr(S64, FractMask, Exp); 1344 auto Not = B.buildNot(S64, Shr); 1345 auto Tmp0 = B.buildAnd(S64, Src, Not); 1346 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1347 1348 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1349 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1350 1351 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1352 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1353 return true; 1354 } 1355 1356 bool AMDGPULegalizerInfo::legalizeITOFP( 1357 MachineInstr &MI, MachineRegisterInfo &MRI, 1358 MachineIRBuilder &B, bool Signed) const { 1359 B.setInstr(MI); 1360 1361 Register Dst = MI.getOperand(0).getReg(); 1362 Register Src = MI.getOperand(1).getReg(); 1363 1364 const LLT S64 = LLT::scalar(64); 1365 const LLT S32 = LLT::scalar(32); 1366 1367 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1368 1369 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1370 1371 auto CvtHi = Signed ? 1372 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1373 B.buildUITOFP(S64, Unmerge.getReg(1)); 1374 1375 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1376 1377 auto ThirtyTwo = B.buildConstant(S32, 32); 1378 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1379 .addUse(CvtHi.getReg(0)) 1380 .addUse(ThirtyTwo.getReg(0)); 1381 1382 // TODO: Should this propagate fast-math-flags? 1383 B.buildFAdd(Dst, LdExp, CvtLo); 1384 MI.eraseFromParent(); 1385 return true; 1386 } 1387 1388 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1389 MachineInstr &MI, MachineRegisterInfo &MRI, 1390 MachineIRBuilder &B) const { 1391 MachineFunction &MF = B.getMF(); 1392 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1393 1394 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1395 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1396 1397 // With ieee_mode disabled, the instructions have the correct behavior 1398 // already for G_FMINNUM/G_FMAXNUM 1399 if (!MFI->getMode().IEEE) 1400 return !IsIEEEOp; 1401 1402 if (IsIEEEOp) 1403 return true; 1404 1405 MachineIRBuilder HelperBuilder(MI); 1406 GISelObserverWrapper DummyObserver; 1407 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1408 HelperBuilder.setInstr(MI); 1409 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1410 } 1411 1412 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1413 MachineInstr &MI, MachineRegisterInfo &MRI, 1414 MachineIRBuilder &B) const { 1415 // TODO: Should move some of this into LegalizerHelper. 1416 1417 // TODO: Promote dynamic indexing of s16 to s32 1418 // TODO: Dynamic s64 indexing is only legal for SGPR. 1419 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); 1420 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1421 return true; 1422 1423 Register Dst = MI.getOperand(0).getReg(); 1424 Register Vec = MI.getOperand(1).getReg(); 1425 1426 LLT VecTy = MRI.getType(Vec); 1427 LLT EltTy = VecTy.getElementType(); 1428 assert(EltTy == MRI.getType(Dst)); 1429 1430 B.setInstr(MI); 1431 1432 if (IdxVal.getValue() < VecTy.getNumElements()) 1433 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); 1434 else 1435 B.buildUndef(Dst); 1436 1437 MI.eraseFromParent(); 1438 return true; 1439 } 1440 1441 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1442 MachineInstr &MI, MachineRegisterInfo &MRI, 1443 MachineIRBuilder &B) const { 1444 // TODO: Should move some of this into LegalizerHelper. 1445 1446 // TODO: Promote dynamic indexing of s16 to s32 1447 // TODO: Dynamic s64 indexing is only legal for SGPR. 1448 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); 1449 if (!IdxVal) // Dynamic case will be selected to register indexing. 1450 return true; 1451 1452 Register Dst = MI.getOperand(0).getReg(); 1453 Register Vec = MI.getOperand(1).getReg(); 1454 Register Ins = MI.getOperand(2).getReg(); 1455 1456 LLT VecTy = MRI.getType(Vec); 1457 LLT EltTy = VecTy.getElementType(); 1458 assert(EltTy == MRI.getType(Ins)); 1459 1460 B.setInstr(MI); 1461 1462 if (IdxVal.getValue() < VecTy.getNumElements()) 1463 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); 1464 else 1465 B.buildUndef(Dst); 1466 1467 MI.eraseFromParent(); 1468 return true; 1469 } 1470 1471 bool AMDGPULegalizerInfo::legalizeSinCos( 1472 MachineInstr &MI, MachineRegisterInfo &MRI, 1473 MachineIRBuilder &B) const { 1474 B.setInstr(MI); 1475 1476 Register DstReg = MI.getOperand(0).getReg(); 1477 Register SrcReg = MI.getOperand(1).getReg(); 1478 LLT Ty = MRI.getType(DstReg); 1479 unsigned Flags = MI.getFlags(); 1480 1481 Register TrigVal; 1482 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1483 if (ST.hasTrigReducedRange()) { 1484 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1485 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1486 .addUse(MulVal.getReg(0)) 1487 .setMIFlags(Flags).getReg(0); 1488 } else 1489 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1490 1491 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1492 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1493 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1494 .addUse(TrigVal) 1495 .setMIFlags(Flags); 1496 MI.eraseFromParent(); 1497 return true; 1498 } 1499 1500 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1501 MachineInstr &MI, MachineRegisterInfo &MRI, 1502 MachineIRBuilder &B) const { 1503 Register DstReg = MI.getOperand(0).getReg(); 1504 LLT Ty = MRI.getType(DstReg); 1505 unsigned AS = Ty.getAddressSpace(); 1506 1507 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1508 MachineFunction &MF = B.getMF(); 1509 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1510 1511 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1512 B.setInstr(MI); 1513 1514 if (!MFI->isEntryFunction()) { 1515 const Function &Fn = MF.getFunction(); 1516 DiagnosticInfoUnsupported BadLDSDecl( 1517 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1518 Fn.getContext().diagnose(BadLDSDecl); 1519 } 1520 1521 // TODO: We could emit code to handle the initialization somewhere. 
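    // An LDS global without an initializer is simply lowered to a constant
    // pointer at its statically allocated offset in LDS.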
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }
  } else
    return false;

  const Function &Fn = MF.getFunction();
  DiagnosticInfoUnsupported BadInit(
    Fn, "unsupported initializer for address space", MI.getDebugLoc());
  Fn.getContext().diagnose(BadInit);
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1598 if (!MRI.getVRegDef(LiveIn)) { 1599 // FIXME: Should have scoped insert pt 1600 MachineBasicBlock &OrigInsBB = B.getMBB(); 1601 auto OrigInsPt = B.getInsertPt(); 1602 1603 MachineBasicBlock &EntryMBB = B.getMF().front(); 1604 EntryMBB.addLiveIn(Arg->getRegister()); 1605 B.setInsertPt(EntryMBB, EntryMBB.begin()); 1606 B.buildCopy(LiveIn, Arg->getRegister()); 1607 1608 B.setInsertPt(OrigInsBB, OrigInsPt); 1609 } 1610 1611 return true; 1612 } 1613 1614 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 1615 MachineInstr &MI, 1616 MachineRegisterInfo &MRI, 1617 MachineIRBuilder &B, 1618 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 1619 B.setInstr(MI); 1620 1621 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 1622 1623 const ArgDescriptor *Arg; 1624 const TargetRegisterClass *RC; 1625 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 1626 if (!Arg) { 1627 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 1628 return false; 1629 } 1630 1631 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 1632 MI.eraseFromParent(); 1633 return true; 1634 } 1635 1636 return false; 1637 } 1638 1639 bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI, 1640 MachineRegisterInfo &MRI, 1641 MachineIRBuilder &B) const { 1642 B.setInstr(MI); 1643 Register Res = MI.getOperand(0).getReg(); 1644 Register LHS = MI.getOperand(2).getReg(); 1645 Register RHS = MI.getOperand(3).getReg(); 1646 uint16_t Flags = MI.getFlags(); 1647 1648 LLT S32 = LLT::scalar(32); 1649 LLT S1 = LLT::scalar(1); 1650 1651 auto Abs = B.buildFAbs(S32, RHS, Flags); 1652 const APFloat C0Val(1.0f); 1653 1654 auto C0 = B.buildConstant(S32, 0x6f800000); 1655 auto C1 = B.buildConstant(S32, 0x2f800000); 1656 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 1657 1658 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 1659 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 1660 1661 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 1662 1663 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 1664 .addUse(Mul0.getReg(0)) 1665 .setMIFlags(Flags); 1666 1667 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 1668 1669 B.buildFMul(Res, Sel, Mul1, Flags); 1670 1671 MI.eraseFromParent(); 1672 return true; 1673 } 1674 1675 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 1676 MachineRegisterInfo &MRI, 1677 MachineIRBuilder &B) const { 1678 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 1679 if (!MFI->isEntryFunction()) { 1680 return legalizePreloadedArgIntrin(MI, MRI, B, 1681 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 1682 } 1683 1684 B.setInstr(MI); 1685 1686 uint64_t Offset = 1687 ST.getTargetLowering()->getImplicitParameterOffset( 1688 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 1689 Register DstReg = MI.getOperand(0).getReg(); 1690 LLT DstTy = MRI.getType(DstReg); 1691 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 1692 1693 const ArgDescriptor *Arg; 1694 const TargetRegisterClass *RC; 1695 std::tie(Arg, RC) 1696 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 1697 if (!Arg) 1698 return false; 1699 1700 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 1701 if (!loadInputValue(KernargPtrReg, B, Arg)) 1702 return false; 1703 1704 B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 1705 MI.eraseFromParent(); 1706 return true; 1707 } 1708 1709 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 1710 MachineRegisterInfo &MRI, 1711 
MachineIRBuilder &B, 1712 unsigned AddrSpace) const { 1713 B.setInstr(MI); 1714 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 1715 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 1716 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 1717 MI.eraseFromParent(); 1718 return true; 1719 } 1720 1721 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 1722 MachineRegisterInfo &MRI, 1723 MachineIRBuilder &B) const { 1724 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 1725 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { 1726 case Intrinsic::amdgcn_if: { 1727 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { 1728 const SIRegisterInfo *TRI 1729 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 1730 1731 B.setInstr(*BrCond); 1732 Register Def = MI.getOperand(1).getReg(); 1733 Register Use = MI.getOperand(3).getReg(); 1734 B.buildInstr(AMDGPU::SI_IF) 1735 .addDef(Def) 1736 .addUse(Use) 1737 .addMBB(BrCond->getOperand(1).getMBB()); 1738 1739 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 1740 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 1741 MI.eraseFromParent(); 1742 BrCond->eraseFromParent(); 1743 return true; 1744 } 1745 1746 return false; 1747 } 1748 case Intrinsic::amdgcn_loop: { 1749 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { 1750 const SIRegisterInfo *TRI 1751 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 1752 1753 B.setInstr(*BrCond); 1754 Register Reg = MI.getOperand(2).getReg(); 1755 B.buildInstr(AMDGPU::SI_LOOP) 1756 .addUse(Reg) 1757 .addMBB(BrCond->getOperand(1).getMBB()); 1758 MI.eraseFromParent(); 1759 BrCond->eraseFromParent(); 1760 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 1761 return true; 1762 } 1763 1764 return false; 1765 } 1766 case Intrinsic::amdgcn_kernarg_segment_ptr: 1767 return legalizePreloadedArgIntrin( 1768 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 1769 case Intrinsic::amdgcn_implicitarg_ptr: 1770 return legalizeImplicitArgPtr(MI, MRI, B); 1771 case Intrinsic::amdgcn_workitem_id_x: 1772 return legalizePreloadedArgIntrin(MI, MRI, B, 1773 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 1774 case Intrinsic::amdgcn_workitem_id_y: 1775 return legalizePreloadedArgIntrin(MI, MRI, B, 1776 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 1777 case Intrinsic::amdgcn_workitem_id_z: 1778 return legalizePreloadedArgIntrin(MI, MRI, B, 1779 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 1780 case Intrinsic::amdgcn_workgroup_id_x: 1781 return legalizePreloadedArgIntrin(MI, MRI, B, 1782 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 1783 case Intrinsic::amdgcn_workgroup_id_y: 1784 return legalizePreloadedArgIntrin(MI, MRI, B, 1785 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 1786 case Intrinsic::amdgcn_workgroup_id_z: 1787 return legalizePreloadedArgIntrin(MI, MRI, B, 1788 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 1789 case Intrinsic::amdgcn_dispatch_ptr: 1790 return legalizePreloadedArgIntrin(MI, MRI, B, 1791 AMDGPUFunctionArgInfo::DISPATCH_PTR); 1792 case Intrinsic::amdgcn_queue_ptr: 1793 return legalizePreloadedArgIntrin(MI, MRI, B, 1794 AMDGPUFunctionArgInfo::QUEUE_PTR); 1795 case Intrinsic::amdgcn_implicit_buffer_ptr: 1796 return legalizePreloadedArgIntrin( 1797 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 1798 case Intrinsic::amdgcn_dispatch_id: 1799 return legalizePreloadedArgIntrin(MI, MRI, B, 1800 AMDGPUFunctionArgInfo::DISPATCH_ID); 1801 case Intrinsic::amdgcn_fdiv_fast: 1802 return 
legalizeFDIVFast(MI, MRI, B); 1803 case Intrinsic::amdgcn_is_shared: 1804 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 1805 case Intrinsic::amdgcn_is_private: 1806 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 1807 case Intrinsic::amdgcn_wavefrontsize: { 1808 B.setInstr(MI); 1809 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 1810 MI.eraseFromParent(); 1811 return true; 1812 } 1813 default: 1814 return true; 1815 } 1816 1817 return true; 1818 } 1819