//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.
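  // The 32-bit add with carry-out maps directly onto S_ADD_U32 / the VALU
  // carry-out adds, so only the {s32, s1} form is kept legal here; wider
  // results are narrowed to 32-bit pieces by the clamp above.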

  getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), });

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});


  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
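  // For example, without 16-bit instructions an s16 G_FADD is widened to s32
  // by the clamp above, and a <2 x s16> G_FADD is scalarized first; with
  // VOP3P both forms stay legal as packed / scalar f16 operations.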

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S96, S32},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };

  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
    .lower();

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
                               GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
                               LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
        .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024)
      .lowerFor({{S16, V2S16}});

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
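
  // The aperture is the upper 32 bits of the 64-bit flat address range that
  // backs the LDS (local) or scratch (private) segment; pairing it with the
  // 32-bit segment offset reconstructs a usable flat pointer.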
  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  // ubfe takes the source, bit offset, and field width operands.
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
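  // Keep G_FMAD legal only when denormals for the result type are flushed;
  // otherwise expand it below via LegalizerHelper::lowerFMad.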
  if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(SITargetLowering::isFlatGlobalAddrSpace(
           MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::vector(2, ValTy);

  B.setInstr(MI);
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
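  // If this is the first use of this live-in, materialize the copy from the
  // physical argument register at the top of the entry block.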
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT ResTy = MRI.getType(Res);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  const MachineFunction &MF = B.getMF();
  bool Unsafe =
    MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);

  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
    return false;

  if (!Unsafe && ResTy == S32 &&
      MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  if (Unsafe) {
    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
      .addUse(RHS)
      .setMIFlags(Flags);
    B.buildFMul(Res, LHS, RCP, Flags);

    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(RHSExt.getReg(0))
    .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(RDst.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

// Emit instructions to enable (Enable == true) or disable (Enable == false)
// the FP32 denormal mode.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
                                   ? FP_DENORM_FLUSH_NONE
                                   : FP_DENORM_FLUSH_IN_FLUSH_OUT;

    unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}

bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(RHS)
      .addUse(LHS)
      .addImm(1)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
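  // The reciprocal refinement below relies on FP32 denormal support for the
  // scaled intermediates; if the function's default mode flushes denormals,
  // temporarily enable them around the FMA chain.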
  if (!Mode.FP32Denormals)
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.FP32Denormals)
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.
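    // Recompute the scale condition manually: compare the high halves of the
    // operands with the corresponding div_scale results and xor the two
    // comparisons.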
    Scale = MRI.createGenericVirtualRegister(S1);

    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    B.buildXor(Scale, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
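  // A flat pointer lies in the queried segment exactly when its high 32 bits
  // match that segment's aperture base, so compare against the aperture.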
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrCond->getOperand(1).getMBB());
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrCond->getOperand(1).getMBB())
          .addImm(0);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
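    // The wavefront size is a compile-time subtarget constant, so the
    // intrinsic folds to an immediate.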
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);
  default:
    return true;
  }

  return true;
}