//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
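// For example, <3 x s8> (24 bits) is widened to <4 x s8> (32 bits).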
88 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 89 return [=](const LegalityQuery &Query) { 90 const LLT Ty = Query.Types[TypeIdx]; 91 92 const LLT EltTy = Ty.getElementType(); 93 const int Size = Ty.getSizeInBits(); 94 const int EltSize = EltTy.getSizeInBits(); 95 const int NextMul32 = (Size + 31) / 32; 96 97 assert(EltSize < 32); 98 99 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 100 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 101 }; 102 } 103 104 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 105 return [=](const LegalityQuery &Query) { 106 const LLT QueryTy = Query.Types[TypeIdx]; 107 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 108 }; 109 } 110 111 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 112 return [=](const LegalityQuery &Query) { 113 const LLT QueryTy = Query.Types[TypeIdx]; 114 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 115 }; 116 } 117 118 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 119 return [=](const LegalityQuery &Query) { 120 const LLT QueryTy = Query.Types[TypeIdx]; 121 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 122 }; 123 } 124 125 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 126 // v2s16. 127 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 128 return [=](const LegalityQuery &Query) { 129 const LLT Ty = Query.Types[TypeIdx]; 130 if (Ty.isVector()) { 131 const int EltSize = Ty.getElementType().getSizeInBits(); 132 return EltSize == 32 || EltSize == 64 || 133 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 134 EltSize == 128 || EltSize == 256; 135 } 136 137 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 138 }; 139 } 140 141 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 142 return [=](const LegalityQuery &Query) { 143 return Query.Types[TypeIdx].getElementType() == Type; 144 }; 145 } 146 147 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 148 return [=](const LegalityQuery &Query) { 149 const LLT Ty = Query.Types[TypeIdx]; 150 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 151 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 152 }; 153 } 154 155 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 156 const GCNTargetMachine &TM) 157 : ST(ST_) { 158 using namespace TargetOpcode; 159 160 auto GetAddrSpacePtr = [&TM](unsigned AS) { 161 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 162 }; 163 164 const LLT S1 = LLT::scalar(1); 165 const LLT S8 = LLT::scalar(8); 166 const LLT S16 = LLT::scalar(16); 167 const LLT S32 = LLT::scalar(32); 168 const LLT S64 = LLT::scalar(64); 169 const LLT S96 = LLT::scalar(96); 170 const LLT S128 = LLT::scalar(128); 171 const LLT S256 = LLT::scalar(256); 172 const LLT S1024 = LLT::scalar(1024); 173 174 const LLT V2S16 = LLT::vector(2, 16); 175 const LLT V4S16 = LLT::vector(4, 16); 176 177 const LLT V2S32 = LLT::vector(2, 32); 178 const LLT V3S32 = LLT::vector(3, 32); 179 const LLT V4S32 = LLT::vector(4, 32); 180 const LLT V5S32 = LLT::vector(5, 32); 181 const LLT V6S32 = LLT::vector(6, 32); 182 const LLT V7S32 = LLT::vector(7, 32); 183 const LLT V8S32 = LLT::vector(8, 32); 184 const LLT V9S32 = LLT::vector(9, 32); 185 const LLT V10S32 = LLT::vector(10, 32); 186 const LLT V11S32 = LLT::vector(11, 32); 187 const LLT V12S32 = LLT::vector(12, 32); 188 const LLT V13S32 = LLT::vector(13, 32); 189 const LLT V14S32 = 
LLT::vector(14, 32); 190 const LLT V15S32 = LLT::vector(15, 32); 191 const LLT V16S32 = LLT::vector(16, 32); 192 const LLT V32S32 = LLT::vector(32, 32); 193 194 const LLT V2S64 = LLT::vector(2, 64); 195 const LLT V3S64 = LLT::vector(3, 64); 196 const LLT V4S64 = LLT::vector(4, 64); 197 const LLT V5S64 = LLT::vector(5, 64); 198 const LLT V6S64 = LLT::vector(6, 64); 199 const LLT V7S64 = LLT::vector(7, 64); 200 const LLT V8S64 = LLT::vector(8, 64); 201 const LLT V16S64 = LLT::vector(16, 64); 202 203 std::initializer_list<LLT> AllS32Vectors = 204 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 205 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 206 std::initializer_list<LLT> AllS64Vectors = 207 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 208 209 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 210 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 211 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 212 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 213 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 214 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 215 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 216 217 const LLT CodePtr = FlatPtr; 218 219 const std::initializer_list<LLT> AddrSpaces64 = { 220 GlobalPtr, ConstantPtr, FlatPtr 221 }; 222 223 const std::initializer_list<LLT> AddrSpaces32 = { 224 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 225 }; 226 227 const std::initializer_list<LLT> FPTypesBase = { 228 S32, S64 229 }; 230 231 const std::initializer_list<LLT> FPTypes16 = { 232 S32, S64, S16 233 }; 234 235 const std::initializer_list<LLT> FPTypesPK16 = { 236 S32, S64, S16, V2S16 237 }; 238 239 setAction({G_BRCOND, S1}, Legal); 240 241 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 242 // elements for v3s16 243 getActionDefinitionsBuilder(G_PHI) 244 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 245 .legalFor(AllS32Vectors) 246 .legalFor(AllS64Vectors) 247 .legalFor(AddrSpaces64) 248 .legalFor(AddrSpaces32) 249 .clampScalar(0, S32, S256) 250 .widenScalarToNextPow2(0, 32) 251 .clampMaxNumElements(0, S32, 16) 252 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 253 .legalIf(isPointer(0)); 254 255 if (ST.has16BitInsts()) { 256 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 257 .legalFor({S32, S16}) 258 .clampScalar(0, S16, S32) 259 .scalarize(0); 260 } else { 261 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 262 .legalFor({S32}) 263 .clampScalar(0, S32, S32) 264 .scalarize(0); 265 } 266 267 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 268 .legalFor({S32}) 269 .clampScalar(0, S32, S32) 270 .scalarize(0); 271 272 // Report legal for any types we can handle anywhere. For the cases only legal 273 // on the SALU, RegBankSelect will be able to re-legalize. 274 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 275 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 276 .clampScalar(0, S32, S64) 277 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 278 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 279 .widenScalarToNextPow2(0) 280 .scalarize(0); 281 282 getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO, 283 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 284 .legalFor({{S32, S1}}) 285 .clampScalar(0, S32, S32) 286 .scalarize(0); // TODO: Implement. 287 288 getActionDefinitionsBuilder(G_BITCAST) 289 // Don't worry about the size constraint. 
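    // Any pair of register types is accepted here; matching source and
    // destination sizes are enforced by the MIR verifier rather than by this
    // rule.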
290 .legalIf(all(isRegisterType(0), isRegisterType(1))) 291 // FIXME: Testing hack 292 .legalForCartesianProduct({S16, LLT::vector(2, 8), }); 293 294 getActionDefinitionsBuilder(G_FCONSTANT) 295 .legalFor({S32, S64, S16}) 296 .clampScalar(0, S16, S64); 297 298 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 299 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 300 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 301 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 302 .clampScalarOrElt(0, S32, S1024) 303 .legalIf(isMultiple32(0)) 304 .widenScalarToNextPow2(0, 32) 305 .clampMaxNumElements(0, S32, 16); 306 307 308 // FIXME: i1 operands to intrinsics should always be legal, but other i1 309 // values may not be legal. We need to figure out how to distinguish 310 // between these two scenarios. 311 getActionDefinitionsBuilder(G_CONSTANT) 312 .legalFor({S1, S32, S64, S16, GlobalPtr, 313 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 314 .clampScalar(0, S32, S64) 315 .widenScalarToNextPow2(0) 316 .legalIf(isPointer(0)); 317 318 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 319 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 320 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); 321 322 323 auto &FPOpActions = getActionDefinitionsBuilder( 324 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 325 .legalFor({S32, S64}); 326 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 327 .customFor({S32, S64}); 328 329 if (ST.has16BitInsts()) { 330 if (ST.hasVOP3PInsts()) 331 FPOpActions.legalFor({S16, V2S16}); 332 else 333 FPOpActions.legalFor({S16}); 334 335 TrigActions.customFor({S16}); 336 } 337 338 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 339 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 340 341 if (ST.hasVOP3PInsts()) { 342 MinNumMaxNum.customFor(FPTypesPK16) 343 .clampMaxNumElements(0, S16, 2) 344 .clampScalar(0, S16, S64) 345 .scalarize(0); 346 } else if (ST.has16BitInsts()) { 347 MinNumMaxNum.customFor(FPTypes16) 348 .clampScalar(0, S16, S64) 349 .scalarize(0); 350 } else { 351 MinNumMaxNum.customFor(FPTypesBase) 352 .clampScalar(0, S32, S64) 353 .scalarize(0); 354 } 355 356 if (ST.hasVOP3PInsts()) 357 FPOpActions.clampMaxNumElements(0, S16, 2); 358 359 FPOpActions 360 .scalarize(0) 361 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 362 363 TrigActions 364 .scalarize(0) 365 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 366 367 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 368 .legalFor(FPTypesPK16) 369 .clampMaxNumElements(0, S16, 2) 370 .scalarize(0) 371 .clampScalar(0, S16, S64); 372 373 // TODO: Implement 374 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); 375 376 if (ST.has16BitInsts()) { 377 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 378 .legalFor({S32, S64, S16}) 379 .scalarize(0) 380 .clampScalar(0, S16, S64); 381 } else { 382 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 383 .legalFor({S32, S64}) 384 .scalarize(0) 385 .clampScalar(0, S32, S64); 386 } 387 388 getActionDefinitionsBuilder(G_FPTRUNC) 389 .legalFor({{S32, S64}, {S16, S32}}) 390 .scalarize(0); 391 392 getActionDefinitionsBuilder(G_FPEXT) 393 .legalFor({{S64, S32}, {S32, S16}}) 394 .lowerFor({{S64, S16}}) // FIXME: Implement 395 .scalarize(0); 396 397 // TODO: Verify V_BFI_B32 is generated from expanded bit ops. 
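  // Always lowered; the expansion is expected to be integer bit operations on
  // the sign bit (hence the TODO above about matching V_BFI_B32).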
398 getActionDefinitionsBuilder(G_FCOPYSIGN).lower(); 399 400 getActionDefinitionsBuilder(G_FSUB) 401 // Use actual fsub instruction 402 .legalFor({S32}) 403 // Must use fadd + fneg 404 .lowerFor({S64, S16, V2S16}) 405 .scalarize(0) 406 .clampScalar(0, S32, S64); 407 408 // Whether this is legal depends on the floating point mode for the function. 409 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 410 if (ST.hasMadF16()) 411 FMad.customFor({S32, S16}); 412 else 413 FMad.customFor({S32}); 414 FMad.scalarize(0) 415 .lower(); 416 417 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 418 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 419 {S32, S1}, {S64, S1}, {S16, S1}, 420 {S96, S32}, 421 // FIXME: Hack 422 {S64, LLT::scalar(33)}, 423 {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}}) 424 .scalarize(0); 425 426 // TODO: Split s1->s64 during regbankselect for VALU. 427 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 428 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}}) 429 .lowerFor({{S32, S64}}) 430 .customFor({{S64, S64}}); 431 if (ST.has16BitInsts()) 432 IToFP.legalFor({{S16, S16}}); 433 IToFP.clampScalar(1, S32, S64) 434 .scalarize(0); 435 436 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 437 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}); 438 if (ST.has16BitInsts()) 439 FPToI.legalFor({{S16, S16}}); 440 else 441 FPToI.minScalar(1, S32); 442 443 FPToI.minScalar(0, S32) 444 .scalarize(0); 445 446 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 447 .legalFor({S32, S64}) 448 .scalarize(0); 449 450 if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 451 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 452 .legalFor({S32, S64}) 453 .clampScalar(0, S32, S64) 454 .scalarize(0); 455 } else { 456 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 457 .legalFor({S32}) 458 .customFor({S64}) 459 .clampScalar(0, S32, S64) 460 .scalarize(0); 461 } 462 463 getActionDefinitionsBuilder(G_GEP) 464 .legalForCartesianProduct(AddrSpaces64, {S64}) 465 .legalForCartesianProduct(AddrSpaces32, {S32}) 466 .scalarize(0); 467 468 getActionDefinitionsBuilder(G_PTR_MASK) 469 .scalarize(0) 470 .alwaysLegal(); 471 472 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 473 474 auto &CmpBuilder = 475 getActionDefinitionsBuilder(G_ICMP) 476 .legalForCartesianProduct( 477 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 478 .legalFor({{S1, S32}, {S1, S64}}); 479 if (ST.has16BitInsts()) { 480 CmpBuilder.legalFor({{S1, S16}}); 481 } 482 483 CmpBuilder 484 .widenScalarToNextPow2(1) 485 .clampScalar(1, S32, S64) 486 .scalarize(0) 487 .legalIf(all(typeIs(0, S1), isPointer(1))); 488 489 getActionDefinitionsBuilder(G_FCMP) 490 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 491 .widenScalarToNextPow2(1) 492 .clampScalar(1, S32, S64) 493 .scalarize(0); 494 495 // FIXME: fexp, flog2, flog10 needs to be custom lowered. 496 getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2, 497 G_FLOG, G_FLOG2, G_FLOG10}) 498 .legalFor({S32}) 499 .scalarize(0); 500 501 // The 64-bit versions produce 32-bit results, but only on the SALU. 
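  // The result (type index 0) is therefore clamped to s32, while the source
  // (type index 1) may be s32 or s64.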
502 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, 503 G_CTTZ, G_CTTZ_ZERO_UNDEF, 504 G_CTPOP}) 505 .legalFor({{S32, S32}, {S32, S64}}) 506 .clampScalar(0, S32, S32) 507 .clampScalar(1, S32, S64) 508 .scalarize(0) 509 .widenScalarToNextPow2(0, 32) 510 .widenScalarToNextPow2(1, 32); 511 512 // TODO: Expand for > s32 513 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) 514 .legalFor({S32}) 515 .clampScalar(0, S32, S32) 516 .scalarize(0); 517 518 if (ST.has16BitInsts()) { 519 if (ST.hasVOP3PInsts()) { 520 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 521 .legalFor({S32, S16, V2S16}) 522 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 523 .clampMaxNumElements(0, S16, 2) 524 .clampScalar(0, S16, S32) 525 .widenScalarToNextPow2(0) 526 .scalarize(0); 527 } else { 528 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 529 .legalFor({S32, S16}) 530 .widenScalarToNextPow2(0) 531 .clampScalar(0, S16, S32) 532 .scalarize(0); 533 } 534 } else { 535 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 536 .legalFor({S32}) 537 .clampScalar(0, S32, S32) 538 .widenScalarToNextPow2(0) 539 .scalarize(0); 540 } 541 542 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 543 return [=](const LegalityQuery &Query) { 544 return Query.Types[TypeIdx0].getSizeInBits() < 545 Query.Types[TypeIdx1].getSizeInBits(); 546 }; 547 }; 548 549 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 550 return [=](const LegalityQuery &Query) { 551 return Query.Types[TypeIdx0].getSizeInBits() > 552 Query.Types[TypeIdx1].getSizeInBits(); 553 }; 554 }; 555 556 getActionDefinitionsBuilder(G_INTTOPTR) 557 // List the common cases 558 .legalForCartesianProduct(AddrSpaces64, {S64}) 559 .legalForCartesianProduct(AddrSpaces32, {S32}) 560 .scalarize(0) 561 // Accept any address space as long as the size matches 562 .legalIf(sameSize(0, 1)) 563 .widenScalarIf(smallerThan(1, 0), 564 [](const LegalityQuery &Query) { 565 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 566 }) 567 .narrowScalarIf(greaterThan(1, 0), 568 [](const LegalityQuery &Query) { 569 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 570 }); 571 572 getActionDefinitionsBuilder(G_PTRTOINT) 573 // List the common cases 574 .legalForCartesianProduct(AddrSpaces64, {S64}) 575 .legalForCartesianProduct(AddrSpaces32, {S32}) 576 .scalarize(0) 577 // Accept any address space as long as the size matches 578 .legalIf(sameSize(0, 1)) 579 .widenScalarIf(smallerThan(0, 1), 580 [](const LegalityQuery &Query) { 581 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 582 }) 583 .narrowScalarIf( 584 greaterThan(0, 1), 585 [](const LegalityQuery &Query) { 586 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 587 }); 588 589 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 590 .scalarize(0) 591 .custom(); 592 593 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 594 // handle some operations by just promoting the register during 595 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 596 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { 597 switch (AS) { 598 // FIXME: Private element size. 599 case AMDGPUAS::PRIVATE_ADDRESS: 600 return 32; 601 // FIXME: Check subtarget 602 case AMDGPUAS::LOCAL_ADDRESS: 603 return ST.useDS128() ? 128 : 64; 604 605 // Treat constant and global as identical. 
SMRD loads are sometimes usable 606 // for global loads (ideally constant address space should be eliminated) 607 // depending on the context. Legality cannot be context dependent, but 608 // RegBankSelect can split the load as necessary depending on the pointer 609 // register bank/uniformity and if the memory is invariant or not written in 610 // a kernel. 611 case AMDGPUAS::CONSTANT_ADDRESS: 612 case AMDGPUAS::GLOBAL_ADDRESS: 613 return 512; 614 default: 615 return 128; 616 } 617 }; 618 619 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { 620 const LLT DstTy = Query.Types[0]; 621 622 // Split vector extloads. 623 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 624 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 625 return true; 626 627 const LLT PtrTy = Query.Types[1]; 628 unsigned AS = PtrTy.getAddressSpace(); 629 if (MemSize > maxSizeForAddrSpace(AS)) 630 return true; 631 632 // Catch weird sized loads that don't evenly divide into the access sizes 633 // TODO: May be able to widen depending on alignment etc. 634 unsigned NumRegs = MemSize / 32; 635 if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) 636 return true; 637 638 unsigned Align = Query.MMODescrs[0].AlignInBits; 639 if (Align < MemSize) { 640 const SITargetLowering *TLI = ST.getTargetLowering(); 641 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 642 } 643 644 return false; 645 }; 646 647 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 648 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 649 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 650 651 // TODO: Refine based on subtargets which support unaligned access or 128-bit 652 // LDS 653 // TODO: Unsupported flat for SI. 654 655 for (unsigned Op : {G_LOAD, G_STORE}) { 656 const bool IsStore = Op == G_STORE; 657 658 auto &Actions = getActionDefinitionsBuilder(Op); 659 // Whitelist the common cases. 
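    // Each entry below is {register type, pointer type, memory size in bits,
    // minimum alignment in bits}; an alignment of 0 means no requirement.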
660 // TODO: Pointer loads 661 // TODO: Wide constant loads 662 // TODO: Only CI+ has 3x loads 663 // TODO: Loads to s16 on gfx9 664 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 665 {V2S32, GlobalPtr, 64, GlobalAlign32}, 666 {V3S32, GlobalPtr, 96, GlobalAlign32}, 667 {S96, GlobalPtr, 96, GlobalAlign32}, 668 {V4S32, GlobalPtr, 128, GlobalAlign32}, 669 {S128, GlobalPtr, 128, GlobalAlign32}, 670 {S64, GlobalPtr, 64, GlobalAlign32}, 671 {V2S64, GlobalPtr, 128, GlobalAlign32}, 672 {V2S16, GlobalPtr, 32, GlobalAlign32}, 673 {S32, GlobalPtr, 8, GlobalAlign8}, 674 {S32, GlobalPtr, 16, GlobalAlign16}, 675 676 {S32, LocalPtr, 32, 32}, 677 {S64, LocalPtr, 64, 32}, 678 {V2S32, LocalPtr, 64, 32}, 679 {S32, LocalPtr, 8, 8}, 680 {S32, LocalPtr, 16, 16}, 681 {V2S16, LocalPtr, 32, 32}, 682 683 {S32, PrivatePtr, 32, 32}, 684 {S32, PrivatePtr, 8, 8}, 685 {S32, PrivatePtr, 16, 16}, 686 {V2S16, PrivatePtr, 32, 32}, 687 688 {S32, FlatPtr, 32, GlobalAlign32}, 689 {S32, FlatPtr, 16, GlobalAlign16}, 690 {S32, FlatPtr, 8, GlobalAlign8}, 691 {V2S16, FlatPtr, 32, GlobalAlign32}, 692 693 {S32, ConstantPtr, 32, GlobalAlign32}, 694 {V2S32, ConstantPtr, 64, GlobalAlign32}, 695 {V3S32, ConstantPtr, 96, GlobalAlign32}, 696 {V4S32, ConstantPtr, 128, GlobalAlign32}, 697 {S64, ConstantPtr, 64, GlobalAlign32}, 698 {S128, ConstantPtr, 128, GlobalAlign32}, 699 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 700 Actions 701 .customIf(typeIs(1, Constant32Ptr)) 702 .narrowScalarIf( 703 [=](const LegalityQuery &Query) -> bool { 704 return !Query.Types[0].isVector() && needToSplitLoad(Query); 705 }, 706 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 707 const LLT DstTy = Query.Types[0]; 708 const LLT PtrTy = Query.Types[1]; 709 710 const unsigned DstSize = DstTy.getSizeInBits(); 711 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 712 713 // Split extloads. 714 if (DstSize > MemSize) 715 return std::make_pair(0, LLT::scalar(MemSize)); 716 717 if (DstSize > 32 && (DstSize % 32 != 0)) { 718 // FIXME: Need a way to specify non-extload of larger size if 719 // suitably aligned. 720 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 721 } 722 723 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 724 if (MemSize > MaxSize) 725 return std::make_pair(0, LLT::scalar(MaxSize)); 726 727 unsigned Align = Query.MMODescrs[0].AlignInBits; 728 return std::make_pair(0, LLT::scalar(Align)); 729 }) 730 .fewerElementsIf( 731 [=](const LegalityQuery &Query) -> bool { 732 return Query.Types[0].isVector() && needToSplitLoad(Query); 733 }, 734 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 735 const LLT DstTy = Query.Types[0]; 736 const LLT PtrTy = Query.Types[1]; 737 738 LLT EltTy = DstTy.getElementType(); 739 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 740 741 // Split if it's too large for the address space. 742 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 743 unsigned NumElts = DstTy.getNumElements(); 744 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 745 746 // FIXME: Refine when odd breakdowns handled 747 // The scalars will need to be re-legalized. 748 if (NumPieces == 1 || NumPieces >= NumElts || 749 NumElts % NumPieces != 0) 750 return std::make_pair(0, EltTy); 751 752 return std::make_pair(0, 753 LLT::vector(NumElts / NumPieces, EltTy)); 754 } 755 756 // Need to split because of alignment. 
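          // If the element size exceeds the known alignment, split to a
          // smaller vector; otherwise fall back to the element type and let
          // the scalars be relegalized.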
757 unsigned Align = Query.MMODescrs[0].AlignInBits; 758 unsigned EltSize = EltTy.getSizeInBits(); 759 if (EltSize > Align && 760 (EltSize / Align < DstTy.getNumElements())) { 761 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 762 } 763 764 // May need relegalization for the scalars. 765 return std::make_pair(0, EltTy); 766 }) 767 .minScalar(0, S32); 768 769 if (IsStore) 770 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 771 772 // TODO: Need a bitcast lower option? 773 Actions 774 .legalIf([=](const LegalityQuery &Query) { 775 const LLT Ty0 = Query.Types[0]; 776 unsigned Size = Ty0.getSizeInBits(); 777 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 778 unsigned Align = Query.MMODescrs[0].AlignInBits; 779 780 // No extending vector loads. 781 if (Size > MemSize && Ty0.isVector()) 782 return false; 783 784 // FIXME: Widening store from alignment not valid. 785 if (MemSize < Size) 786 MemSize = std::max(MemSize, Align); 787 788 switch (MemSize) { 789 case 8: 790 case 16: 791 return Size == 32; 792 case 32: 793 case 64: 794 case 128: 795 return true; 796 case 96: 797 return ST.hasDwordx3LoadStores(); 798 case 256: 799 case 512: 800 return true; 801 default: 802 return false; 803 } 804 }) 805 .widenScalarToNextPow2(0) 806 // TODO: v3s32->v4s32 with alignment 807 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 808 } 809 810 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 811 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 812 {S32, GlobalPtr, 16, 2 * 8}, 813 {S32, LocalPtr, 8, 8}, 814 {S32, LocalPtr, 16, 16}, 815 {S32, PrivatePtr, 8, 8}, 816 {S32, PrivatePtr, 16, 16}, 817 {S32, ConstantPtr, 8, 8}, 818 {S32, ConstantPtr, 16, 2 * 8}}); 819 if (ST.hasFlatAddressSpace()) { 820 ExtLoads.legalForTypesWithMemDesc( 821 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 822 } 823 824 ExtLoads.clampScalar(0, S32, S32) 825 .widenScalarToNextPow2(0) 826 .unsupportedIfMemSizeNotPow2() 827 .lower(); 828 829 auto &Atomics = getActionDefinitionsBuilder( 830 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 831 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 832 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 833 G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG}) 834 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 835 {S64, GlobalPtr}, {S64, LocalPtr}}); 836 if (ST.hasFlatAddressSpace()) { 837 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 838 } 839 840 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 841 .legalFor({{S32, LocalPtr}}); 842 843 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) 844 .lower(); 845 846 // TODO: Pointer types, any 32-bit or 64-bit vector 847 getActionDefinitionsBuilder(G_SELECT) 848 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 849 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 850 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1}) 851 .clampScalar(0, S16, S64) 852 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 853 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 854 .scalarize(1) 855 .clampMaxNumElements(0, S32, 2) 856 .clampMaxNumElements(0, LocalPtr, 2) 857 .clampMaxNumElements(0, PrivatePtr, 2) 858 .scalarize(0) 859 .widenScalarToNextPow2(0) 860 .legalIf(all(isPointer(0), typeIs(1, S1))); 861 862 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 863 // be more flexible with the shift amount type. 
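  // 16-bit shifts are only available with 16-bit instructions, and packed
  // v2s16 shifts additionally require VOP3P.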
864 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 865 .legalFor({{S32, S32}, {S64, S32}}); 866 if (ST.has16BitInsts()) { 867 if (ST.hasVOP3PInsts()) { 868 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 869 .clampMaxNumElements(0, S16, 2); 870 } else 871 Shifts.legalFor({{S16, S32}, {S16, S16}}); 872 873 Shifts.clampScalar(1, S16, S32); 874 Shifts.clampScalar(0, S16, S64); 875 Shifts.widenScalarToNextPow2(0, 16); 876 } else { 877 // Make sure we legalize the shift amount type first, as the general 878 // expansion for the shifted type will produce much worse code if it hasn't 879 // been truncated already. 880 Shifts.clampScalar(1, S32, S32); 881 Shifts.clampScalar(0, S32, S64); 882 Shifts.widenScalarToNextPow2(0, 32); 883 } 884 Shifts.scalarize(0); 885 886 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 887 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 888 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 889 unsigned IdxTypeIdx = 2; 890 891 getActionDefinitionsBuilder(Op) 892 .customIf([=](const LegalityQuery &Query) { 893 const LLT EltTy = Query.Types[EltTypeIdx]; 894 const LLT VecTy = Query.Types[VecTypeIdx]; 895 const LLT IdxTy = Query.Types[IdxTypeIdx]; 896 return (EltTy.getSizeInBits() == 16 || 897 EltTy.getSizeInBits() % 32 == 0) && 898 VecTy.getSizeInBits() % 32 == 0 && 899 VecTy.getSizeInBits() <= 1024 && 900 IdxTy.getSizeInBits() == 32; 901 }) 902 .clampScalar(EltTypeIdx, S32, S64) 903 .clampScalar(VecTypeIdx, S32, S64) 904 .clampScalar(IdxTypeIdx, S32, S32); 905 } 906 907 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 908 .unsupportedIf([=](const LegalityQuery &Query) { 909 const LLT &EltTy = Query.Types[1].getElementType(); 910 return Query.Types[0] != EltTy; 911 }); 912 913 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 914 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 915 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 916 917 // FIXME: Doesn't handle extract of illegal sizes. 918 getActionDefinitionsBuilder(Op) 919 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 920 // FIXME: Multiples of 16 should not be legal. 
921 .legalIf([=](const LegalityQuery &Query) { 922 const LLT BigTy = Query.Types[BigTyIdx]; 923 const LLT LitTy = Query.Types[LitTyIdx]; 924 return (BigTy.getSizeInBits() % 32 == 0) && 925 (LitTy.getSizeInBits() % 16 == 0); 926 }) 927 .widenScalarIf( 928 [=](const LegalityQuery &Query) { 929 const LLT BigTy = Query.Types[BigTyIdx]; 930 return (BigTy.getScalarSizeInBits() < 16); 931 }, 932 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 933 .widenScalarIf( 934 [=](const LegalityQuery &Query) { 935 const LLT LitTy = Query.Types[LitTyIdx]; 936 return (LitTy.getScalarSizeInBits() < 16); 937 }, 938 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 939 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 940 .widenScalarToNextPow2(BigTyIdx, 32); 941 942 } 943 944 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 945 .legalForCartesianProduct(AllS32Vectors, {S32}) 946 .legalForCartesianProduct(AllS64Vectors, {S64}) 947 .clampNumElements(0, V16S32, V32S32) 948 .clampNumElements(0, V2S64, V16S64); 949 950 if (ST.hasScalarPackInsts()) 951 BuildVector.legalFor({V2S16, S32}); 952 953 BuildVector 954 .minScalarSameAs(1, 0) 955 .legalIf(isRegisterType(0)) 956 .minScalarOrElt(0, S32); 957 958 if (ST.hasScalarPackInsts()) { 959 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 960 .legalFor({V2S16, S32}) 961 .lower(); 962 } else { 963 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 964 .lower(); 965 } 966 967 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 968 .legalIf(isRegisterType(0)); 969 970 // TODO: Don't fully scalarize v2s16 pieces 971 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 972 973 // Merge/Unmerge 974 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 975 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 976 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 977 978 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 979 const LLT &Ty = Query.Types[TypeIdx]; 980 if (Ty.isVector()) { 981 const LLT &EltTy = Ty.getElementType(); 982 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) 983 return true; 984 if (!isPowerOf2_32(EltTy.getSizeInBits())) 985 return true; 986 } 987 return false; 988 }; 989 990 auto &Builder = getActionDefinitionsBuilder(Op) 991 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 992 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 993 // worth considering the multiples of 64 since 2*192 and 2*384 are not 994 // valid. 995 .clampScalar(LitTyIdx, S16, S256) 996 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 997 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 998 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 999 elementTypeIs(1, S16)), 1000 changeTo(1, V2S16)) 1001 // Break up vectors with weird elements into scalars 1002 .fewerElementsIf( 1003 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, 1004 scalarize(0)) 1005 .fewerElementsIf( 1006 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, 1007 scalarize(1)) 1008 .clampScalar(BigTyIdx, S32, S1024) 1009 .lowerFor({{S16, V2S16}}); 1010 1011 if (Op == G_MERGE_VALUES) { 1012 Builder.widenScalarIf( 1013 // TODO: Use 16-bit shifts if legal for 8-bit values? 
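        // Widen merge sources narrower than 32 bits up to s32 so the pieces
        // can be combined with 32-bit shifts (see the TODO above).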
1014 [=](const LegalityQuery &Query) { 1015 const LLT Ty = Query.Types[LitTyIdx]; 1016 return Ty.getSizeInBits() < 32; 1017 }, 1018 changeTo(LitTyIdx, S32)); 1019 } 1020 1021 Builder.widenScalarIf( 1022 [=](const LegalityQuery &Query) { 1023 const LLT Ty = Query.Types[BigTyIdx]; 1024 return !isPowerOf2_32(Ty.getSizeInBits()) && 1025 Ty.getSizeInBits() % 16 != 0; 1026 }, 1027 [=](const LegalityQuery &Query) { 1028 // Pick the next power of 2, or a multiple of 64 over 128. 1029 // Whichever is smaller. 1030 const LLT &Ty = Query.Types[BigTyIdx]; 1031 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1032 if (NewSizeInBits >= 256) { 1033 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1034 if (RoundedTo < NewSizeInBits) 1035 NewSizeInBits = RoundedTo; 1036 } 1037 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1038 }) 1039 .legalIf([=](const LegalityQuery &Query) { 1040 const LLT &BigTy = Query.Types[BigTyIdx]; 1041 const LLT &LitTy = Query.Types[LitTyIdx]; 1042 1043 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1044 return false; 1045 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1046 return false; 1047 1048 return BigTy.getSizeInBits() % 16 == 0 && 1049 LitTy.getSizeInBits() % 16 == 0 && 1050 BigTy.getSizeInBits() <= 1024; 1051 }) 1052 // Any vectors left are the wrong size. Scalarize them. 1053 .scalarize(0) 1054 .scalarize(1); 1055 } 1056 1057 getActionDefinitionsBuilder(G_SEXT_INREG).lower(); 1058 1059 computeTables(); 1060 verify(*ST.getInstrInfo()); 1061 } 1062 1063 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1064 MachineRegisterInfo &MRI, 1065 MachineIRBuilder &B, 1066 GISelChangeObserver &Observer) const { 1067 switch (MI.getOpcode()) { 1068 case TargetOpcode::G_ADDRSPACE_CAST: 1069 return legalizeAddrSpaceCast(MI, MRI, B); 1070 case TargetOpcode::G_FRINT: 1071 return legalizeFrint(MI, MRI, B); 1072 case TargetOpcode::G_FCEIL: 1073 return legalizeFceil(MI, MRI, B); 1074 case TargetOpcode::G_INTRINSIC_TRUNC: 1075 return legalizeIntrinsicTrunc(MI, MRI, B); 1076 case TargetOpcode::G_SITOFP: 1077 return legalizeITOFP(MI, MRI, B, true); 1078 case TargetOpcode::G_UITOFP: 1079 return legalizeITOFP(MI, MRI, B, false); 1080 case TargetOpcode::G_FMINNUM: 1081 case TargetOpcode::G_FMAXNUM: 1082 case TargetOpcode::G_FMINNUM_IEEE: 1083 case TargetOpcode::G_FMAXNUM_IEEE: 1084 return legalizeMinNumMaxNum(MI, MRI, B); 1085 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1086 return legalizeExtractVectorElt(MI, MRI, B); 1087 case TargetOpcode::G_INSERT_VECTOR_ELT: 1088 return legalizeInsertVectorElt(MI, MRI, B); 1089 case TargetOpcode::G_FSIN: 1090 case TargetOpcode::G_FCOS: 1091 return legalizeSinCos(MI, MRI, B); 1092 case TargetOpcode::G_GLOBAL_VALUE: 1093 return legalizeGlobalValue(MI, MRI, B); 1094 case TargetOpcode::G_LOAD: 1095 return legalizeLoad(MI, MRI, B, Observer); 1096 case TargetOpcode::G_FMAD: 1097 return legalizeFMad(MI, MRI, B); 1098 default: 1099 return false; 1100 } 1101 1102 llvm_unreachable("expected switch to return"); 1103 } 1104 1105 Register AMDGPULegalizerInfo::getSegmentAperture( 1106 unsigned AS, 1107 MachineRegisterInfo &MRI, 1108 MachineIRBuilder &B) const { 1109 MachineFunction &MF = B.getMF(); 1110 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1111 const LLT S32 = LLT::scalar(32); 1112 1113 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1114 1115 if (ST.hasApertureRegs()) { 1116 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1117 // getreg. 
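    // S_GETREG_B32 returns the aperture base in the low bits of the result;
    // shifting left by the field width yields the high 32 bits of the 64-bit
    // aperture address.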
1118 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1119 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1120 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1121 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1122 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1123 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1124 unsigned Encoding = 1125 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1126 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1127 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1128 1129 Register ApertureReg = MRI.createGenericVirtualRegister(S32); 1130 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1131 1132 B.buildInstr(AMDGPU::S_GETREG_B32) 1133 .addDef(GetReg) 1134 .addImm(Encoding); 1135 MRI.setType(GetReg, S32); 1136 1137 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1138 B.buildInstr(TargetOpcode::G_SHL) 1139 .addDef(ApertureReg) 1140 .addUse(GetReg) 1141 .addUse(ShiftAmt.getReg(0)); 1142 1143 return ApertureReg; 1144 } 1145 1146 Register QueuePtr = MRI.createGenericVirtualRegister( 1147 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1148 1149 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1150 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1151 return Register(); 1152 1153 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1154 // private_segment_aperture_base_hi. 1155 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1156 1157 // FIXME: Don't use undef 1158 Value *V = UndefValue::get(PointerType::get( 1159 Type::getInt8Ty(MF.getFunction().getContext()), 1160 AMDGPUAS::CONSTANT_ADDRESS)); 1161 1162 MachinePointerInfo PtrInfo(V, StructOffset); 1163 MachineMemOperand *MMO = MF.getMachineMemOperand( 1164 PtrInfo, 1165 MachineMemOperand::MOLoad | 1166 MachineMemOperand::MODereferenceable | 1167 MachineMemOperand::MOInvariant, 1168 4, 1169 MinAlign(64, StructOffset)); 1170 1171 Register LoadResult = MRI.createGenericVirtualRegister(S32); 1172 Register LoadAddr; 1173 1174 B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1175 B.buildLoad(LoadResult, LoadAddr, *MMO); 1176 return LoadResult; 1177 } 1178 1179 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1180 MachineInstr &MI, MachineRegisterInfo &MRI, 1181 MachineIRBuilder &B) const { 1182 MachineFunction &MF = B.getMF(); 1183 1184 B.setInstr(MI); 1185 1186 const LLT S32 = LLT::scalar(32); 1187 Register Dst = MI.getOperand(0).getReg(); 1188 Register Src = MI.getOperand(1).getReg(); 1189 1190 LLT DstTy = MRI.getType(Dst); 1191 LLT SrcTy = MRI.getType(Src); 1192 unsigned DestAS = DstTy.getAddressSpace(); 1193 unsigned SrcAS = SrcTy.getAddressSpace(); 1194 1195 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1196 // vector element. 1197 assert(!DstTy.isVector()); 1198 1199 const AMDGPUTargetMachine &TM 1200 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1201 1202 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1203 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1204 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1205 return true; 1206 } 1207 1208 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1209 // Truncate. 
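    // Casting to the 32-bit constant address space just keeps the low 32 bits
    // of the 64-bit pointer.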
1210 B.buildExtract(Dst, Src, 0); 1211 MI.eraseFromParent(); 1212 return true; 1213 } 1214 1215 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1216 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1217 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1218 1219 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1220 // another. Merge operands are required to be the same type, but creating an 1221 // extra ptrtoint would be kind of pointless. 1222 auto HighAddr = B.buildConstant( 1223 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1224 B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); 1225 MI.eraseFromParent(); 1226 return true; 1227 } 1228 1229 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1230 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1231 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1232 unsigned NullVal = TM.getNullPointerValue(DestAS); 1233 1234 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1235 auto FlatNull = B.buildConstant(SrcTy, 0); 1236 1237 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); 1238 1239 // Extract low 32-bits of the pointer. 1240 B.buildExtract(PtrLo32, Src, 0); 1241 1242 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1243 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); 1244 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1245 1246 MI.eraseFromParent(); 1247 return true; 1248 } 1249 1250 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1251 return false; 1252 1253 if (!ST.hasFlatAddressSpace()) 1254 return false; 1255 1256 auto SegmentNull = 1257 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1258 auto FlatNull = 1259 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1260 1261 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1262 if (!ApertureReg.isValid()) 1263 return false; 1264 1265 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1266 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); 1267 1268 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); 1269 1270 // Coerce the type of the low half of the result so we can use merge_values. 1271 Register SrcAsInt = MRI.createGenericVirtualRegister(S32); 1272 B.buildInstr(TargetOpcode::G_PTRTOINT) 1273 .addDef(SrcAsInt) 1274 .addUse(Src); 1275 1276 // TODO: Should we allow mismatched types but matching sizes in merges to 1277 // avoid the ptrtoint? 1278 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); 1279 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); 1280 1281 MI.eraseFromParent(); 1282 return true; 1283 } 1284 1285 bool AMDGPULegalizerInfo::legalizeFrint( 1286 MachineInstr &MI, MachineRegisterInfo &MRI, 1287 MachineIRBuilder &B) const { 1288 B.setInstr(MI); 1289 1290 Register Src = MI.getOperand(1).getReg(); 1291 LLT Ty = MRI.getType(Src); 1292 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1293 1294 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1295 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1296 1297 auto C1 = B.buildFConstant(Ty, C1Val); 1298 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1299 1300 // TODO: Should this propagate fast-math-flags? 
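  // Adding and then subtracting 2^52 (with the sign of the source) rounds the
  // value to an integer; sources with magnitude of at least 2^52 are already
  // integral and are passed through by the select below.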
1301 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1302 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1303 1304 auto C2 = B.buildFConstant(Ty, C2Val); 1305 auto Fabs = B.buildFAbs(Ty, Src); 1306 1307 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1308 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1309 return true; 1310 } 1311 1312 bool AMDGPULegalizerInfo::legalizeFceil( 1313 MachineInstr &MI, MachineRegisterInfo &MRI, 1314 MachineIRBuilder &B) const { 1315 B.setInstr(MI); 1316 1317 const LLT S1 = LLT::scalar(1); 1318 const LLT S64 = LLT::scalar(64); 1319 1320 Register Src = MI.getOperand(1).getReg(); 1321 assert(MRI.getType(Src) == S64); 1322 1323 // result = trunc(src) 1324 // if (src > 0.0 && src != result) 1325 // result += 1.0 1326 1327 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); 1328 1329 const auto Zero = B.buildFConstant(S64, 0.0); 1330 const auto One = B.buildFConstant(S64, 1.0); 1331 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1332 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1333 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1334 auto Add = B.buildSelect(S64, And, One, Zero); 1335 1336 // TODO: Should this propagate fast-math-flags? 1337 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1338 return true; 1339 } 1340 1341 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1342 MachineIRBuilder &B) { 1343 const unsigned FractBits = 52; 1344 const unsigned ExpBits = 11; 1345 LLT S32 = LLT::scalar(32); 1346 1347 auto Const0 = B.buildConstant(S32, FractBits - 32); 1348 auto Const1 = B.buildConstant(S32, ExpBits); 1349 1350 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1351 .addUse(Const0.getReg(0)) 1352 .addUse(Const1.getReg(0)); 1353 1354 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1355 } 1356 1357 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1358 MachineInstr &MI, MachineRegisterInfo &MRI, 1359 MachineIRBuilder &B) const { 1360 B.setInstr(MI); 1361 1362 const LLT S1 = LLT::scalar(1); 1363 const LLT S32 = LLT::scalar(32); 1364 const LLT S64 = LLT::scalar(64); 1365 1366 Register Src = MI.getOperand(1).getReg(); 1367 assert(MRI.getType(Src) == S64); 1368 1369 // TODO: Should this use extract since the low half is unused? 1370 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1371 Register Hi = Unmerge.getReg(1); 1372 1373 // Extract the upper half, since this is where we will find the sign and 1374 // exponent. 1375 auto Exp = extractF64Exponent(Hi, B); 1376 1377 const unsigned FractBits = 52; 1378 1379 // Extract the sign bit. 1380 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1381 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1382 1383 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1384 1385 const auto Zero32 = B.buildConstant(S32, 0); 1386 1387 // Extend back to 64-bits. 
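  // The sign bit is placed in the high word so that a signed zero can be
  // selected as the result when the exponent is negative (|x| < 1).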
1388 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); 1389 1390 auto Shr = B.buildAShr(S64, FractMask, Exp); 1391 auto Not = B.buildNot(S64, Shr); 1392 auto Tmp0 = B.buildAnd(S64, Src, Not); 1393 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1394 1395 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1396 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1397 1398 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1399 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1400 return true; 1401 } 1402 1403 bool AMDGPULegalizerInfo::legalizeITOFP( 1404 MachineInstr &MI, MachineRegisterInfo &MRI, 1405 MachineIRBuilder &B, bool Signed) const { 1406 B.setInstr(MI); 1407 1408 Register Dst = MI.getOperand(0).getReg(); 1409 Register Src = MI.getOperand(1).getReg(); 1410 1411 const LLT S64 = LLT::scalar(64); 1412 const LLT S32 = LLT::scalar(32); 1413 1414 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1415 1416 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1417 1418 auto CvtHi = Signed ? 1419 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1420 B.buildUITOFP(S64, Unmerge.getReg(1)); 1421 1422 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1423 1424 auto ThirtyTwo = B.buildConstant(S32, 32); 1425 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1426 .addUse(CvtHi.getReg(0)) 1427 .addUse(ThirtyTwo.getReg(0)); 1428 1429 // TODO: Should this propagate fast-math-flags? 1430 B.buildFAdd(Dst, LdExp, CvtLo); 1431 MI.eraseFromParent(); 1432 return true; 1433 } 1434 1435 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1436 MachineInstr &MI, MachineRegisterInfo &MRI, 1437 MachineIRBuilder &B) const { 1438 MachineFunction &MF = B.getMF(); 1439 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1440 1441 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1442 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1443 1444 // With ieee_mode disabled, the instructions have the correct behavior 1445 // already for G_FMINNUM/G_FMAXNUM 1446 if (!MFI->getMode().IEEE) 1447 return !IsIEEEOp; 1448 1449 if (IsIEEEOp) 1450 return true; 1451 1452 MachineIRBuilder HelperBuilder(MI); 1453 GISelObserverWrapper DummyObserver; 1454 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1455 HelperBuilder.setInstr(MI); 1456 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1457 } 1458 1459 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1460 MachineInstr &MI, MachineRegisterInfo &MRI, 1461 MachineIRBuilder &B) const { 1462 // TODO: Should move some of this into LegalizerHelper. 1463 1464 // TODO: Promote dynamic indexing of s16 to s32 1465 // TODO: Dynamic s64 indexing is only legal for SGPR. 1466 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); 1467 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1468 return true; 1469 1470 Register Dst = MI.getOperand(0).getReg(); 1471 Register Vec = MI.getOperand(1).getReg(); 1472 1473 LLT VecTy = MRI.getType(Vec); 1474 LLT EltTy = VecTy.getElementType(); 1475 assert(EltTy == MRI.getType(Dst)); 1476 1477 B.setInstr(MI); 1478 1479 if (IdxVal.getValue() < VecTy.getNumElements()) 1480 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); 1481 else 1482 B.buildUndef(Dst); 1483 1484 MI.eraseFromParent(); 1485 return true; 1486 } 1487 1488 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1489 MachineInstr &MI, MachineRegisterInfo &MRI, 1490 MachineIRBuilder &B) const { 1491 // TODO: Should move some of this into LegalizerHelper. 1492 1493 // TODO: Promote dynamic indexing of s16 to s32 1494 // TODO: Dynamic s64 indexing is only legal for SGPR. 1495 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); 1496 if (!IdxVal) // Dynamic case will be selected to register indexing. 1497 return true; 1498 1499 Register Dst = MI.getOperand(0).getReg(); 1500 Register Vec = MI.getOperand(1).getReg(); 1501 Register Ins = MI.getOperand(2).getReg(); 1502 1503 LLT VecTy = MRI.getType(Vec); 1504 LLT EltTy = VecTy.getElementType(); 1505 assert(EltTy == MRI.getType(Ins)); 1506 1507 B.setInstr(MI); 1508 1509 if (IdxVal.getValue() < VecTy.getNumElements()) 1510 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); 1511 else 1512 B.buildUndef(Dst); 1513 1514 MI.eraseFromParent(); 1515 return true; 1516 } 1517 1518 bool AMDGPULegalizerInfo::legalizeSinCos( 1519 MachineInstr &MI, MachineRegisterInfo &MRI, 1520 MachineIRBuilder &B) const { 1521 B.setInstr(MI); 1522 1523 Register DstReg = MI.getOperand(0).getReg(); 1524 Register SrcReg = MI.getOperand(1).getReg(); 1525 LLT Ty = MRI.getType(DstReg); 1526 unsigned Flags = MI.getFlags(); 1527 1528 Register TrigVal; 1529 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1530 if (ST.hasTrigReducedRange()) { 1531 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1532 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1533 .addUse(MulVal.getReg(0)) 1534 .setMIFlags(Flags).getReg(0); 1535 } else 1536 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1537 1538 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1539 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1540 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1541 .addUse(TrigVal) 1542 .setMIFlags(Flags); 1543 MI.eraseFromParent(); 1544 return true; 1545 } 1546 1547 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1548 Register DstReg, LLT PtrTy, 1549 MachineIRBuilder &B, const GlobalValue *GV, 1550 unsigned Offset, unsigned GAFlags) const { 1551 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1552 // to the following code sequence: 1553 // 1554 // For constant address space: 1555 // s_getpc_b64 s[0:1] 1556 // s_add_u32 s0, s0, $symbol 1557 // s_addc_u32 s1, s1, 0 1558 // 1559 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1560 // a fixup or relocation is emitted to replace $symbol with a literal 1561 // constant, which is a pc-relative offset from the encoding of the $symbol 1562 // operand to the global variable. 
1563 // 1564 // For global address space: 1565 // s_getpc_b64 s[0:1] 1566 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1567 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1568 // 1569 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1570 // fixups or relocations are emitted to replace $symbol@*@lo and 1571 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1572 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1573 // operand to the global variable. 1574 // 1575 // What we want here is an offset from the value returned by s_getpc 1576 // (which is the address of the s_add_u32 instruction) to the global 1577 // variable, but since the encoding of $symbol starts 4 bytes after the start 1578 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1579 // small. This requires us to add 4 to the global variable offset in order to 1580 // compute the correct address. 1581 1582 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1583 1584 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1585 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1586 1587 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1588 .addDef(PCReg); 1589 1590 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1591 if (GAFlags == SIInstrInfo::MO_NONE) 1592 MIB.addImm(0); 1593 else 1594 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1595 1596 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1597 1598 if (PtrTy.getSizeInBits() == 32) 1599 B.buildExtract(DstReg, PCReg, 0); 1600 return true; 1601 } 1602 1603 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1604 MachineInstr &MI, MachineRegisterInfo &MRI, 1605 MachineIRBuilder &B) const { 1606 Register DstReg = MI.getOperand(0).getReg(); 1607 LLT Ty = MRI.getType(DstReg); 1608 unsigned AS = Ty.getAddressSpace(); 1609 1610 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1611 MachineFunction &MF = B.getMF(); 1612 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1613 B.setInstr(MI); 1614 1615 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1616 if (!MFI->isEntryFunction()) { 1617 const Function &Fn = MF.getFunction(); 1618 DiagnosticInfoUnsupported BadLDSDecl( 1619 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1620 Fn.getContext().diagnose(BadLDSDecl); 1621 } 1622 1623 // TODO: We could emit code to handle the initialization somewhere. 
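    // An LDS global without an initializer lowers to the constant offset
    // assigned to it by allocateLDSGlobal().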
1624 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1625 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 1626 MI.eraseFromParent(); 1627 return true; 1628 } 1629 1630 const Function &Fn = MF.getFunction(); 1631 DiagnosticInfoUnsupported BadInit( 1632 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 1633 Fn.getContext().diagnose(BadInit); 1634 return true; 1635 } 1636 1637 const SITargetLowering *TLI = ST.getTargetLowering(); 1638 1639 if (TLI->shouldEmitFixup(GV)) { 1640 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 1641 MI.eraseFromParent(); 1642 return true; 1643 } 1644 1645 if (TLI->shouldEmitPCReloc(GV)) { 1646 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 1647 MI.eraseFromParent(); 1648 return true; 1649 } 1650 1651 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1652 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 1653 1654 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 1655 MachinePointerInfo::getGOT(MF), 1656 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1657 MachineMemOperand::MOInvariant, 1658 8 /*Size*/, 8 /*Align*/); 1659 1660 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 1661 1662 if (Ty.getSizeInBits() == 32) { 1663 // Truncate if this is a 32-bit constant adrdess. 1664 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 1665 B.buildExtract(DstReg, Load, 0); 1666 } else 1667 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 1668 1669 MI.eraseFromParent(); 1670 return true; 1671 } 1672 1673 bool AMDGPULegalizerInfo::legalizeLoad( 1674 MachineInstr &MI, MachineRegisterInfo &MRI, 1675 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 1676 B.setInstr(MI); 1677 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1678 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 1679 Observer.changingInstr(MI); 1680 MI.getOperand(1).setReg(Cast.getReg(0)); 1681 Observer.changedInstr(MI); 1682 return true; 1683 } 1684 1685 bool AMDGPULegalizerInfo::legalizeFMad( 1686 MachineInstr &MI, MachineRegisterInfo &MRI, 1687 MachineIRBuilder &B) const { 1688 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 1689 assert(Ty.isScalar()); 1690 1691 // TODO: Always legal with future ftz flag. 1692 if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals()) 1693 return true; 1694 if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals()) 1695 return true; 1696 1697 MachineFunction &MF = B.getMF(); 1698 1699 MachineIRBuilder HelperBuilder(MI); 1700 GISelObserverWrapper DummyObserver; 1701 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1702 HelperBuilder.setMBB(*MI.getParent()); 1703 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 1704 } 1705 1706 // Return the use branch instruction, otherwise null if the usage is invalid. 1707 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 1708 MachineRegisterInfo &MRI) { 1709 Register CondDef = MI.getOperand(0).getReg(); 1710 if (!MRI.hasOneNonDBGUse(CondDef)) 1711 return nullptr; 1712 1713 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 1714 return UseMI.getParent() == MI.getParent() && 1715 UseMI.getOpcode() == AMDGPU::G_BRCOND ? 
bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  // TODO: Always legal with future ftz flag.
  if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
    return true;

  MachineFunction &MF = B.getMF();

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

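  // The expansion below guards the hardware reciprocal against overflow: when
  // |RHS| exceeds 0x1p+96 the denominator is pre-scaled by 0x1p-32 before the
  // rcp and the quotient is re-scaled by the same factor afterwards, i.e.
  // roughly LHS / RHS == S * (LHS * rcp(RHS * S)), with
  // S = |RHS| > 0x1p+96 ? 0x1p-32 : 1.0.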
  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
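  // The data operand of a buffer store lives in a full 32-bit VGPR, so 8-bit
  // and 16-bit scalars are any-extended here; the bits above the original
  // width are don't-care.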
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}

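// Custom legalization for target intrinsics. The control-flow intrinsics
// (amdgcn_if / amdgcn_loop) are rewritten together with their single G_BRCOND
// user into SI_IF / SI_LOOP pseudos; many of the remaining cases simply copy a
// preloaded argument register into the result.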
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFast(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);
  default:
    return true;
  }

  return true;
}