//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}
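
// Example for fewerEltsToSize64Vector (illustrative only): a v5s16 operand is
// 80 bits, so Pieces = 2 and NewNumElts = 3, i.e. the value is broken into
// v3s16 pieces that each fit within 64 bits.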

// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}
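
// Example for moreEltsToNext32Bit (illustrative only): a v3s8 operand is
// 24 bits, which rounds up to one 32-bit word, so the type is widened to v4s8.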

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), });

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});


  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S96, S32},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };

  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };
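
  // Minimum alignment (in bits) required for buffer-like (global, flat,
  // constant) accesses of each width; 0 means no restriction because the
  // subtarget tolerates unaligned buffer access.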
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
      .customIf(typeIs(1, Constant32Ptr))
      .narrowScalarIf(
        [=](const LegalityQuery &Query) -> bool {
          return !Query.Types[0].isVector() && needToSplitLoad(Query);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          const unsigned DstSize = DstTy.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;

          // Split extloads.
          if (DstSize > MemSize)
            return std::make_pair(0, LLT::scalar(MemSize));

          if (DstSize > 32 && (DstSize % 32 != 0)) {
            // FIXME: Need a way to specify non-extload of larger size if
            // suitably aligned.
            return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
          }

          unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
          if (MemSize > MaxSize)
            return std::make_pair(0, LLT::scalar(MaxSize));

          unsigned Align = Query.MMODescrs[0].AlignInBits;
          return std::make_pair(0, LLT::scalar(Align));
        })
      .fewerElementsIf(
        [=](const LegalityQuery &Query) -> bool {
          return Query.Types[0].isVector() && needToSplitLoad(Query);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          LLT EltTy = DstTy.getElementType();
          unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

          // Split if it's too large for the address space.
          if (Query.MMODescrs[0].SizeInBits > MaxSize) {
            unsigned NumElts = DstTy.getNumElements();
            unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

            // FIXME: Refine when odd breakdowns handled
            // The scalars will need to be re-legalized.
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::make_pair(0, EltTy);

            return std::make_pair(0,
                                  LLT::vector(NumElts / NumPieces, EltTy));
          }

          // Need to split because of alignment.
          unsigned Align = Query.MMODescrs[0].AlignInBits;
          unsigned EltSize = EltTy.getSizeInBits();
          if (EltSize > Align &&
              (EltSize / Align < DstTy.getNumElements())) {
            return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
          }

          // May need relegalization for the scalars.
          return std::make_pair(0, EltTy);
        })
      .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
      .legalIf([=](const LegalityQuery &Query) {
        const LLT Ty0 = Query.Types[0];
        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        unsigned Align = Query.MMODescrs[0].AlignInBits;

        // No extending vector loads.
        if (Size > MemSize && Ty0.isVector())
          return false;

        // FIXME: Widening store from alignment not valid.
        if (MemSize < Size)
          MemSize = std::max(MemSize, Align);

        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;
        case 96:
          return ST.hasDwordx3LoadStores();
        case 256:
        case 512:
          return true;
        default:
          return false;
        }
      })
      .widenScalarToNextPow2(0)
      // TODO: v3s32->v4s32 with alignment
      .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                               {S32, GlobalPtr, 16, 2 * 8},
                               {S32, LocalPtr, 8, 8},
                               {S32, LocalPtr, 16, 16},
                               {S32, PrivatePtr, 8, 8},
                               {S32, PrivatePtr, 16, 16},
                               {S32, ConstantPtr, 8, 8},
                               {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
      {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
    .lower();

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024)
      .lowerFor({{S16, V2S16}});

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
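
  // Two strategies: subtargets with aperture registers read the 32-bit
  // aperture base directly with s_getreg_b32; older subtargets load it from
  // the amd_queue_t structure reachable through the queue pointer argument.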
  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}
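
// Custom lowering for G_ADDRSPACE_CAST. No-op casts become G_BITCAST. Casts
// between flat and local/private compare the source against the null value of
// its address space, and either extract the low 32 bits of the flat pointer or
// rebuild a flat pointer from the 32-bit offset and the aperture base
// computed above.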
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}
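
// G_FRINT on f64 is expanded by adding and then subtracting copysign(2^52, x);
// at that magnitude the fraction bits round away, and inputs with |x| already
// above ~2^52 have no fractional part and are returned unchanged.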
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}
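
// Custom lowering for s64 G_SITOFP/G_UITOFP to f64: convert the two 32-bit
// halves separately, scale the converted high half by 2^32 with amdgcn.ldexp,
// and add the (always unsigned) low half.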
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}
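
// The hardware sin/cos intrinsics expect an input pre-scaled by 1/(2*pi);
// subtargets with a reduced trig range additionally need the scaled value
// wrapped with amdgcn.fract before the intrinsic call.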
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}
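
// Custom lowering for G_GLOBAL_VALUE. LDS/region globals are resolved to a
// constant offset allocated from the kernel's LDS budget; other globals use
// the pc-relative sequence above, either directly (fixup / REL32 relocation)
// or indirectly through a load from the GOT.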
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  // TODO: Always legal with future ftz flag.
  if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
    return true;

  MachineFunction &MF = B.getMF();

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}
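
// Copy a preloaded argument register into DstReg, creating the entry-block
// live-in copy if it does not exist yet. Masked arguments (several values
// packed into one register) are shifted and masked down to the requested
// field.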
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
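
  // Take the reciprocal of the (possibly scaled) denominator; the same scale
  // factor is multiplied into the final product below, so the scaling cancels.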
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where it is unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
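  // An 8-bit or 16-bit value is widened to a full 32-bit register; the
  // store's memory size is left unchanged.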
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
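  // The argument intrinsics below are lowered to reads of the registers the
  // ABI preloads them into.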
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFast(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);
  default:
    return true;
  }

  return true;
}