//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
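// For example, a <3 x s8> (24 bits) is widened to <4 x s8> (32 bits) by this
// mutation.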
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), });

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE).customFor({LocalPtr});


  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S96, S32},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  // TODO: Legal for s1->s64, requires split for VALU.
  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and whether the memory is invariant or not
    // written in a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };

  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
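    // Each entry below is {value type, pointer type, memory size in bits,
    // minimum alignment in bits}; the GlobalAlign* values above collapse to 0
    // (no alignment requirement) when unaligned buffer access is available.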
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
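              // (e.g. a v4s32 load that is only 16-bit aligned is broken into
              // v2s32 pieces here.)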
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64);

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .lowerFor({{S16, V2S16}})
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
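          // (e.g. an s88 source widens to s128, while s300 widens to s320,
          // the next multiple of 64, rather than all the way to s512.)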
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
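    // Pack the hwreg ID, bit offset and width-1 into the immediate form
    // expected by s_getreg_b32.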
    unsigned Encoding =
      AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
      Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
      WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating
    // an extra ptrtoint would be kind of pointless.
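    // Rebuild the full 64-bit constant pointer by pairing the 32-bit source
    // with the function's known high address bits.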
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(DestAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
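  // Adding and then subtracting copysign(2^52, Src) forces the mantissa to
  // round to an integer; the compare/select below keeps the original value
  // when |Src| is already >= 2^52.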
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;
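  // A known-constant index either folds to an extract at a fixed bit offset
  // or, when out of bounds, to an implicit_def.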

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    B.setInstr(MI);

    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
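    // For an uninitialized LDS global the address is just the offset assigned
    // within the kernel's group segment, materialized as a constant.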
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }
  } else
    return false;

  const Function &Fn = MF.getFunction();
  DiagnosticInfoUnsupported BadInit(
    Fn, "unsupported initializer for address space", MI.getDebugLoc());
  Fn.getContext().diagnose(BadInit);
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  // TODO: Always legal with future ftz flag.
  if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
    return true;

  MachineFunction &MF = B.getMF();

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

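/// The is.shared/is.private intrinsics are lowered to a compare of the
/// pointer's high 32 bits against the corresponding segment aperture base.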
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

/// Handle the register layout difference for f16 images on some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec-manipulating branch pseudos.
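  // amdgcn_if and amdgcn_loop are only rewritten when their condition result
  // feeds a single G_BRCOND in the same block (checked by verifyCFIntrinsic);
  // otherwise legalization of the intrinsic fails.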
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFast(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
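  // The _format variant additionally repacks 16-bit data via handleD16VData
  // on subtargets that use unpacked d16 memory instructions.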
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);
  default:
    return true;
  }

  return true;
}