//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
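// For example, v3s8 (24 bits) widens to v4s8 (32 bits), and v5s16 (80 bits)
// widens to v6s16 (96 bits).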
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8)});

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});


  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S96, S32},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };

  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
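    // Each legalForTypesWithMemDesc entry below is {result type, pointer type,
    // memory size in bits, minimum alignment in bits}.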
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
    .lower();

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
                               GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
                               LLT::vector(2, LocalPtr),
                               LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024)
      .lowerFor({{S16, V2S16}});

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);
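
  // The returned register holds the high 32 bits of the 64-bit flat address
  // for this segment; legalizeAddrSpaceCast merges it with the low 32 bits of
  // the segment pointer to form the flat pointer.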

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);
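
  // Adding and then subtracting a copy-signed 2^52 rounds away the fractional
  // bits of any |Src| < 2^52; the compare against C2 (just below 2^52) below
  // returns Src unchanged when it is already too large to have a fraction.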
  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);
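
  // The result is selected below: keep only the sign when the exponent is
  // negative, clear the fraction bits below the exponent otherwise, and return
  // Src unchanged when the exponent is larger than the fraction width.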

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }
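
    // An LDS global without an initializer is lowered to a constant byte
    // offset within the kernel's group-segment allocation.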
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
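  // G_FMAD is only kept legal when denormals are flushed for the result type;
  // otherwise fall back to the generic fmul + fadd expansion below.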
  if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(SITargetLowering::isFlatGlobalAddrSpace(
           MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::vector(2, ValTy);

  B.setInstr(MI);
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
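  // If nothing has defined the live-in virtual register yet, emit the copy
  // from the physical argument register at the top of the entry block.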
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT ResTy = MRI.getType(Res);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  const MachineFunction &MF = B.getMF();
  bool Unsafe =
    MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);

  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
    return false;

  if (!Unsafe && ResTy == S32 &&
      MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  if (Unsafe) {
    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
      .addUse(RHS)
      .setMIFlags(Flags);
    B.buildFMul(Res, LHS, RCP, Flags);

    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(RHSExt.getReg(0))
    .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(RDst.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode; otherwise, emit instructions to disable it.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
                                   ? FP_DENORM_FLUSH_NONE
                                   : FP_DENORM_FLUSH_IN_FLUSH_OUT;

    unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}

bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(RHS)
      .addUse(LHS)
      .addImm(1)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
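  // Refine the rcp approximation of the scaled denominator with an fma-based
  // Newton-Raphson style sequence (Fma0 computes 1 - d * r), then hand the
  // refined quotient and residuals to div_fmas/div_fixup. Denormals are
  // temporarily enabled when the mode flushes them so the intermediate
  // products are not flushed to zero.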
  if (!Mode.FP32Denormals)
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.FP32Denormals)
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
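    // Recover the scale condition manually: compare the high halves of the
    // numerator and denominator against the high halves of the div_scale
    // results, then combine the two comparisons with xor.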
    Scale = MRI.createGenericVirtualRegister(S1);

    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    B.buildXor(Scale, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
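  // A flat pointer belongs to the queried segment if its high 32 bits match
  // the segment's aperture base.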
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fix up illegal register types for i8 and i16 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
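  // Intrinsics that read preloaded arguments are rewritten to copies from the
  // corresponding input registers; anything not handled below is treated as
  // already legal.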
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrCond->getOperand(1).getMBB());
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrCond->getOperand(1).getMBB())
          .addImm(0);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
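    // The wavefront size is a subtarget constant, so fold the intrinsic to an
    // immediate.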
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);
  default:
    return true;
  }

  return true;
}