//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
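// For example, <3 x s8> (24 bits) is padded to <4 x s8> (32 bits), and
// <5 x s16> (80 bits) to <6 x s16> (96 bits).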
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.
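  // There is no native signed-overflow instruction; the generic lowering
  // expands G_SADDO/G_SSUBO into the plain add/sub plus a compare-based
  // overflow check.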
  getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), });

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});


  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
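  // The generic lowering expands fcopysign into bitwise masking of the sign
  // bit, which instruction selection should be able to fold into V_BFI_B32.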
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S96, S32},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };

  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
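              // e.g. a <4 x s32> load with only 16-bit alignment is split into
              // <2 x s32> (EltSize / Align elements) pieces; otherwise fall
              // back to single elements below.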
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
    .lower();

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
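  // (The low 4 bits for 16-bit shifts, 5 for 32-bit, 6 for 64-bit.)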
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024)
      .lowerFor({{S16, V2S16}});

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
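    // The s_getreg immediate below packs the hwreg id together with the bit
    // offset and width - 1 of the aperture field within HW_REG_MEM_BASES.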
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
      AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
      Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
      WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
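  // Adding and then subtracting copysign(2^52, src) forces the mantissa to
  // round to an integer; values with |src| > 0x1.fffffffffffffp+51 are already
  // integral and are passed through unchanged by the select below.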
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
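    // For an LDS global without an initializer the "address" is simply the
    // constant byte offset assigned within the kernel's LDS allocation.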
1636 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1637 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 1638 MI.eraseFromParent(); 1639 return true; 1640 } 1641 1642 const Function &Fn = MF.getFunction(); 1643 DiagnosticInfoUnsupported BadInit( 1644 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 1645 Fn.getContext().diagnose(BadInit); 1646 return true; 1647 } 1648 1649 const SITargetLowering *TLI = ST.getTargetLowering(); 1650 1651 if (TLI->shouldEmitFixup(GV)) { 1652 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 1653 MI.eraseFromParent(); 1654 return true; 1655 } 1656 1657 if (TLI->shouldEmitPCReloc(GV)) { 1658 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 1659 MI.eraseFromParent(); 1660 return true; 1661 } 1662 1663 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1664 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 1665 1666 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 1667 MachinePointerInfo::getGOT(MF), 1668 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1669 MachineMemOperand::MOInvariant, 1670 8 /*Size*/, 8 /*Align*/); 1671 1672 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 1673 1674 if (Ty.getSizeInBits() == 32) { 1675 // Truncate if this is a 32-bit constant adrdess. 1676 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 1677 B.buildExtract(DstReg, Load, 0); 1678 } else 1679 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 1680 1681 MI.eraseFromParent(); 1682 return true; 1683 } 1684 1685 bool AMDGPULegalizerInfo::legalizeLoad( 1686 MachineInstr &MI, MachineRegisterInfo &MRI, 1687 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 1688 B.setInstr(MI); 1689 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1690 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 1691 Observer.changingInstr(MI); 1692 MI.getOperand(1).setReg(Cast.getReg(0)); 1693 Observer.changedInstr(MI); 1694 return true; 1695 } 1696 1697 bool AMDGPULegalizerInfo::legalizeFMad( 1698 MachineInstr &MI, MachineRegisterInfo &MRI, 1699 MachineIRBuilder &B) const { 1700 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 1701 assert(Ty.isScalar()); 1702 1703 // TODO: Always legal with future ftz flag. 1704 if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals()) 1705 return true; 1706 if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals()) 1707 return true; 1708 1709 MachineFunction &MF = B.getMF(); 1710 1711 MachineIRBuilder HelperBuilder(MI); 1712 GISelObserverWrapper DummyObserver; 1713 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1714 HelperBuilder.setMBB(*MI.getParent()); 1715 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 1716 } 1717 1718 // Return the use branch instruction, otherwise null if the usage is invalid. 1719 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 1720 MachineRegisterInfo &MRI) { 1721 Register CondDef = MI.getOperand(0).getReg(); 1722 if (!MRI.hasOneNonDBGUse(CondDef)) 1723 return nullptr; 1724 1725 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 1726 return UseMI.getParent() == MI.getParent() && 1727 UseMI.getOpcode() == AMDGPU::G_BRCOND ? 
Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
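  // The bit patterns above are IEEE-754 single-precision constants:
  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32. For very large |RHS| the
  // reciprocal can underflow (and denormals may be flushed), so when
  // |RHS| > 2^96 the denominator is pre-scaled by 2^-32 before taking
  // amdgcn.rcp, and the same scale factor (Sel) is applied to the final
  // product below, keeping the overall result approximately LHS / RHS.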
  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}
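// Illustrative sketch of the transformation handleD16VData above performs on
// an unpacked-D16 subtarget (MIR shape and register names are approximate):
// a <4 x s16> store value %v becomes
//   %e0:_(s16), %e1:_(s16), %e2:_(s16), %e3:_(s16) = G_UNMERGE_VALUES %v(<4 x s16>)
//   %w0:_(s32) = G_ANYEXT %e0(s16)   ; likewise for %e1..%e3
//   %wide:_(<4 x s32>) = G_BUILD_VECTOR %w0, %w1, %w2, %w3
// i.e. each half-word element is widened into its own 32-bit register.
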
bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where it is unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec-manipulating branch pseudos.
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
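  // Most of the cases below read values that the calling convention preloads
  // into registers (workitem/workgroup IDs, the dispatch and queue pointers,
  // etc.). They forward to legalizePreloadedArgIntrin, which looks up the
  // ArgDescriptor for the value and uses loadInputValue to copy the
  // corresponding live-in register (shifting and masking it if the argument
  // is packed) into the intrinsic's destination.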
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFast(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);
  default:
    return true;
  }

  return true;
}