//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
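// For example, <3 x s16> (48 bits) is widened to <4 x s16> (64 bits).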
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 =
      LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.
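
  // There are no dedicated signed add/sub-with-overflow instructions; rely on
  // the generic expansion.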
  getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), });

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});


  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ?
                 S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S96, S32},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  // TODO: Split s1->s64 during regbankselect for VALU.
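  // Conversions from an s64 source to an s64 result take the custom path
  // (legalizeITOFP below), which converts the two 32-bit halves separately and
  // recombines them with ldexp.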
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical.
    // SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };

  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
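              // e.g. a <4 x s32> access with only 16-bit alignment becomes
              // <2 x s32> pieces.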
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
    .lower();

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
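    // Pointer-typed results stay legal as long as the condition is s1.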
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024)
      .lowerFor({{S16, V2S16}});

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS ||
         AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
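    // Casting a 64-bit pointer down to the 32-bit constant address space just
    // keeps the low 32 bits of the source pointer.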
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
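  // Adding and then subtracting 2^52 (with Src's sign) pushes the fraction
  // bits out of the significand, rounding Src to the nearest integer. Values
  // whose magnitude exceeds C2 are already integral, so the select below
  // returns the original source for them.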
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
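  // The merge puts zero in the low half and the sign bit in the high half.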
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
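    // An LDS global without an initializer is just materialized as a constant
    // byte offset into the kernel's group segment.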
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
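  // G_FMAD is only kept legal when denormals are already flushed for this
  // type, matching what the hardware mad instructions produce; otherwise fall
  // through and let LegalizerHelper lower it to an unfused fmul + fadd.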
  if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(SITargetLowering::isFlatGlobalAddrSpace(
           MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::vector(2, ValTy);

  B.setInstr(MI);
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
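  // The copy from the physical argument register is emitted at the start of
  // the entry block so the live-in vreg has a def dominating all uses.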
1822 if (!MRI.getVRegDef(LiveIn)) { 1823 // FIXME: Should have scoped insert pt 1824 MachineBasicBlock &OrigInsBB = B.getMBB(); 1825 auto OrigInsPt = B.getInsertPt(); 1826 1827 MachineBasicBlock &EntryMBB = B.getMF().front(); 1828 EntryMBB.addLiveIn(Arg->getRegister()); 1829 B.setInsertPt(EntryMBB, EntryMBB.begin()); 1830 B.buildCopy(LiveIn, Arg->getRegister()); 1831 1832 B.setInsertPt(OrigInsBB, OrigInsPt); 1833 } 1834 1835 return true; 1836 } 1837 1838 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 1839 MachineInstr &MI, 1840 MachineRegisterInfo &MRI, 1841 MachineIRBuilder &B, 1842 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 1843 B.setInstr(MI); 1844 1845 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 1846 1847 const ArgDescriptor *Arg; 1848 const TargetRegisterClass *RC; 1849 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 1850 if (!Arg) { 1851 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 1852 return false; 1853 } 1854 1855 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 1856 MI.eraseFromParent(); 1857 return true; 1858 } 1859 1860 return false; 1861 } 1862 1863 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 1864 MachineRegisterInfo &MRI, 1865 MachineIRBuilder &B) const { 1866 B.setInstr(MI); 1867 Register Dst = MI.getOperand(0).getReg(); 1868 LLT DstTy = MRI.getType(Dst); 1869 LLT S16 = LLT::scalar(16); 1870 LLT S32 = LLT::scalar(32); 1871 LLT S64 = LLT::scalar(64); 1872 1873 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 1874 return true; 1875 1876 if (DstTy == S16) 1877 return legalizeFDIV16(MI, MRI, B); 1878 if (DstTy == S32) 1879 return legalizeFDIV32(MI, MRI, B); 1880 if (DstTy == S64) 1881 return legalizeFDIV64(MI, MRI, B); 1882 1883 return false; 1884 } 1885 1886 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 1887 MachineRegisterInfo &MRI, 1888 MachineIRBuilder &B) const { 1889 Register Res = MI.getOperand(0).getReg(); 1890 Register LHS = MI.getOperand(1).getReg(); 1891 Register RHS = MI.getOperand(2).getReg(); 1892 1893 uint16_t Flags = MI.getFlags(); 1894 1895 LLT ResTy = MRI.getType(Res); 1896 LLT S32 = LLT::scalar(32); 1897 LLT S64 = LLT::scalar(64); 1898 1899 const MachineFunction &MF = B.getMF(); 1900 bool Unsafe = 1901 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 1902 1903 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 1904 return false; 1905 1906 if (!Unsafe && ResTy == S32 && 1907 MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals) 1908 return false; 1909 1910 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 1911 // 1 / x -> RCP(x) 1912 if (CLHS->isExactlyValue(1.0)) { 1913 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 1914 .addUse(RHS) 1915 .setMIFlags(Flags); 1916 1917 MI.eraseFromParent(); 1918 return true; 1919 } 1920 1921 // -1 / x -> RCP( FNEG(x) ) 1922 if (CLHS->isExactlyValue(-1.0)) { 1923 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 1924 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 1925 .addUse(FNeg.getReg(0)) 1926 .setMIFlags(Flags); 1927 1928 MI.eraseFromParent(); 1929 return true; 1930 } 1931 } 1932 1933 // x / y -> x * (1.0 / y) 1934 if (Unsafe) { 1935 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 1936 .addUse(RHS) 1937 .setMIFlags(Flags); 1938 B.buildFMul(Res, LHS, RCP, Flags); 1939 1940 MI.eraseFromParent(); 1941 return true; 1942 } 1943 1944 return false; 1945 } 1946 1947 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 1948 MachineRegisterInfo &MRI, 1949 
MachineIRBuilder &B) const { 1950 B.setInstr(MI); 1951 Register Res = MI.getOperand(0).getReg(); 1952 Register LHS = MI.getOperand(1).getReg(); 1953 Register RHS = MI.getOperand(2).getReg(); 1954 1955 uint16_t Flags = MI.getFlags(); 1956 1957 LLT S16 = LLT::scalar(16); 1958 LLT S32 = LLT::scalar(32); 1959 1960 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 1961 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 1962 1963 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 1964 .addUse(RHSExt.getReg(0)) 1965 .setMIFlags(Flags); 1966 1967 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 1968 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 1969 1970 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 1971 .addUse(RDst.getReg(0)) 1972 .addUse(RHS) 1973 .addUse(LHS) 1974 .setMIFlags(Flags); 1975 1976 MI.eraseFromParent(); 1977 return true; 1978 } 1979 1980 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 1981 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 1982 static void toggleSPDenormMode(bool Enable, 1983 MachineIRBuilder &B, 1984 const GCNSubtarget &ST, 1985 AMDGPU::SIModeRegisterDefaults Mode) { 1986 // Set SP denorm mode to this value. 1987 unsigned SPDenormMode = 1988 Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; 1989 1990 if (ST.hasDenormModeInst()) { 1991 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 1992 unsigned DPDenormModeDefault = Mode.FP64FP16Denormals 1993 ? FP_DENORM_FLUSH_NONE 1994 : FP_DENORM_FLUSH_IN_FLUSH_OUT; 1995 1996 unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 1997 B.buildInstr(AMDGPU::S_DENORM_MODE) 1998 .addImm(NewDenormModeValue); 1999 2000 } else { 2001 // Select FP32 bit field in mode register. 2002 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2003 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2004 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2005 2006 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2007 .addImm(SPDenormMode) 2008 .addImm(SPDenormModeBitField); 2009 } 2010 } 2011 2012 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2013 MachineRegisterInfo &MRI, 2014 MachineIRBuilder &B) const { 2015 B.setInstr(MI); 2016 Register Res = MI.getOperand(0).getReg(); 2017 Register LHS = MI.getOperand(1).getReg(); 2018 Register RHS = MI.getOperand(2).getReg(); 2019 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2020 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2021 2022 uint16_t Flags = MI.getFlags(); 2023 2024 LLT S32 = LLT::scalar(32); 2025 LLT S1 = LLT::scalar(1); 2026 2027 auto One = B.buildFConstant(S32, 1.0f); 2028 2029 auto DenominatorScaled = 2030 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2031 .addUse(RHS) 2032 .addUse(LHS) 2033 .addImm(1) 2034 .setMIFlags(Flags); 2035 auto NumeratorScaled = 2036 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2037 .addUse(LHS) 2038 .addUse(RHS) 2039 .addImm(0) 2040 .setMIFlags(Flags); 2041 2042 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2043 .addUse(DenominatorScaled.getReg(0)) 2044 .setMIFlags(Flags); 2045 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2046 2047 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2048 // aren't modeled as reading it. 
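  // If the function's FP32 mode flushes denormals, temporarily enable them
  // around the refinement sequence so the scaled intermediate values are not
  // flushed, then restore the default mode afterwards.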
  if (!Mode.FP32Denormals)
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.FP32Denormals)
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
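    // Recompute the scale condition manually: compare the high halves of the
    // operands with the div_scale results to determine which input was scaled,
    // and combine the two tests with xor.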
    Scale = MRI.createGenericVirtualRegister(S1);

    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    B.buildXor(Scale, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
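  // A flat pointer is in the queried segment iff the high 32 bits of the
  // pointer equal that segment's aperture base.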
2236 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 2237 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2238 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2239 MI.eraseFromParent(); 2240 return true; 2241 } 2242 2243 /// Handle register layout difference for f16 images for some subtargets. 2244 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2245 MachineRegisterInfo &MRI, 2246 Register Reg) const { 2247 if (!ST.hasUnpackedD16VMem()) 2248 return Reg; 2249 2250 const LLT S16 = LLT::scalar(16); 2251 const LLT S32 = LLT::scalar(32); 2252 LLT StoreVT = MRI.getType(Reg); 2253 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2254 2255 auto Unmerge = B.buildUnmerge(S16, Reg); 2256 2257 SmallVector<Register, 4> WideRegs; 2258 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2259 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2260 2261 int NumElts = StoreVT.getNumElements(); 2262 2263 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2264 } 2265 2266 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI, 2267 MachineRegisterInfo &MRI, 2268 MachineIRBuilder &B, 2269 bool IsFormat) const { 2270 // TODO: Reject f16 format on targets where unsupported. 2271 Register VData = MI.getOperand(1).getReg(); 2272 LLT Ty = MRI.getType(VData); 2273 2274 B.setInstr(MI); 2275 2276 const LLT S32 = LLT::scalar(32); 2277 const LLT S16 = LLT::scalar(16); 2278 2279 // Fixup illegal register types for i8 stores. 2280 if (Ty == LLT::scalar(8) || Ty == S16) { 2281 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2282 MI.getOperand(1).setReg(AnyExt); 2283 return true; 2284 } 2285 2286 if (Ty.isVector()) { 2287 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2288 if (IsFormat) 2289 MI.getOperand(1).setReg(handleD16VData(B, MRI, VData)); 2290 return true; 2291 } 2292 2293 return Ty.getElementType() == S32 && Ty.getNumElements() <= 4; 2294 } 2295 2296 return Ty == S32; 2297 } 2298 2299 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 2300 MachineRegisterInfo &MRI, 2301 MachineIRBuilder &B) const { 2302 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
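  // For the control flow intrinsics, verifyCFIntrinsic checks that the i1
  // result feeds a single G_BRCOND in the same block; the intrinsic and the
  // branch are then rewritten together into SI_IF / SI_ELSE / SI_LOOP.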
2303 auto IntrID = MI.getIntrinsicID(); 2304 switch (IntrID) { 2305 case Intrinsic::amdgcn_if: 2306 case Intrinsic::amdgcn_else: { 2307 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { 2308 const SIRegisterInfo *TRI 2309 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 2310 2311 B.setInstr(*BrCond); 2312 Register Def = MI.getOperand(1).getReg(); 2313 Register Use = MI.getOperand(3).getReg(); 2314 2315 if (IntrID == Intrinsic::amdgcn_if) { 2316 B.buildInstr(AMDGPU::SI_IF) 2317 .addDef(Def) 2318 .addUse(Use) 2319 .addMBB(BrCond->getOperand(1).getMBB()); 2320 } else { 2321 B.buildInstr(AMDGPU::SI_ELSE) 2322 .addDef(Def) 2323 .addUse(Use) 2324 .addMBB(BrCond->getOperand(1).getMBB()) 2325 .addImm(0); 2326 } 2327 2328 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 2329 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 2330 MI.eraseFromParent(); 2331 BrCond->eraseFromParent(); 2332 return true; 2333 } 2334 2335 return false; 2336 } 2337 case Intrinsic::amdgcn_loop: { 2338 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { 2339 const SIRegisterInfo *TRI 2340 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 2341 2342 B.setInstr(*BrCond); 2343 Register Reg = MI.getOperand(2).getReg(); 2344 B.buildInstr(AMDGPU::SI_LOOP) 2345 .addUse(Reg) 2346 .addMBB(BrCond->getOperand(1).getMBB()); 2347 MI.eraseFromParent(); 2348 BrCond->eraseFromParent(); 2349 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 2350 return true; 2351 } 2352 2353 return false; 2354 } 2355 case Intrinsic::amdgcn_kernarg_segment_ptr: 2356 return legalizePreloadedArgIntrin( 2357 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2358 case Intrinsic::amdgcn_implicitarg_ptr: 2359 return legalizeImplicitArgPtr(MI, MRI, B); 2360 case Intrinsic::amdgcn_workitem_id_x: 2361 return legalizePreloadedArgIntrin(MI, MRI, B, 2362 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 2363 case Intrinsic::amdgcn_workitem_id_y: 2364 return legalizePreloadedArgIntrin(MI, MRI, B, 2365 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 2366 case Intrinsic::amdgcn_workitem_id_z: 2367 return legalizePreloadedArgIntrin(MI, MRI, B, 2368 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 2369 case Intrinsic::amdgcn_workgroup_id_x: 2370 return legalizePreloadedArgIntrin(MI, MRI, B, 2371 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 2372 case Intrinsic::amdgcn_workgroup_id_y: 2373 return legalizePreloadedArgIntrin(MI, MRI, B, 2374 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 2375 case Intrinsic::amdgcn_workgroup_id_z: 2376 return legalizePreloadedArgIntrin(MI, MRI, B, 2377 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 2378 case Intrinsic::amdgcn_dispatch_ptr: 2379 return legalizePreloadedArgIntrin(MI, MRI, B, 2380 AMDGPUFunctionArgInfo::DISPATCH_PTR); 2381 case Intrinsic::amdgcn_queue_ptr: 2382 return legalizePreloadedArgIntrin(MI, MRI, B, 2383 AMDGPUFunctionArgInfo::QUEUE_PTR); 2384 case Intrinsic::amdgcn_implicit_buffer_ptr: 2385 return legalizePreloadedArgIntrin( 2386 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 2387 case Intrinsic::amdgcn_dispatch_id: 2388 return legalizePreloadedArgIntrin(MI, MRI, B, 2389 AMDGPUFunctionArgInfo::DISPATCH_ID); 2390 case Intrinsic::amdgcn_fdiv_fast: 2391 return legalizeFDIVFastIntrin(MI, MRI, B); 2392 case Intrinsic::amdgcn_is_shared: 2393 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 2394 case Intrinsic::amdgcn_is_private: 2395 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 2396 case Intrinsic::amdgcn_wavefrontsize: { 2397 B.setInstr(MI); 2398 
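    // The wavefront size is a subtarget constant, so the intrinsic folds to an
    // immediate here.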
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);
  default:
    return true;
  }

  return true;
}