//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 97 return [=](const LegalityQuery &Query) { 98 const LLT Ty = Query.Types[TypeIdx]; 99 100 const LLT EltTy = Ty.getElementType(); 101 const int Size = Ty.getSizeInBits(); 102 const int EltSize = EltTy.getSizeInBits(); 103 const int NextMul32 = (Size + 31) / 32; 104 105 assert(EltSize < 32); 106 107 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 108 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 109 }; 110 } 111 112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 113 return [=](const LegalityQuery &Query) { 114 const LLT QueryTy = Query.Types[TypeIdx]; 115 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 116 }; 117 } 118 119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 120 return [=](const LegalityQuery &Query) { 121 const LLT QueryTy = Query.Types[TypeIdx]; 122 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 123 }; 124 } 125 126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 127 return [=](const LegalityQuery &Query) { 128 const LLT QueryTy = Query.Types[TypeIdx]; 129 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 130 }; 131 } 132 133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 134 // v2s16. 135 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 136 return [=](const LegalityQuery &Query) { 137 const LLT Ty = Query.Types[TypeIdx]; 138 if (Ty.isVector()) { 139 const int EltSize = Ty.getElementType().getSizeInBits(); 140 return EltSize == 32 || EltSize == 64 || 141 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 142 EltSize == 128 || EltSize == 256; 143 } 144 145 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 146 }; 147 } 148 149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 150 return [=](const LegalityQuery &Query) { 151 return Query.Types[TypeIdx].getElementType() == Type; 152 }; 153 } 154 155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 156 return [=](const LegalityQuery &Query) { 157 const LLT Ty = Query.Types[TypeIdx]; 158 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 159 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 160 }; 161 } 162 163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 164 const GCNTargetMachine &TM) 165 : ST(ST_) { 166 using namespace TargetOpcode; 167 168 auto GetAddrSpacePtr = [&TM](unsigned AS) { 169 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 170 }; 171 172 const LLT S1 = LLT::scalar(1); 173 const LLT S8 = LLT::scalar(8); 174 const LLT S16 = LLT::scalar(16); 175 const LLT S32 = LLT::scalar(32); 176 const LLT S64 = LLT::scalar(64); 177 const LLT S96 = LLT::scalar(96); 178 const LLT S128 = LLT::scalar(128); 179 const LLT S256 = LLT::scalar(256); 180 const LLT S1024 = LLT::scalar(1024); 181 182 const LLT V2S16 = LLT::vector(2, 16); 183 const LLT V4S16 = LLT::vector(4, 16); 184 185 const LLT V2S32 = LLT::vector(2, 32); 186 const LLT V3S32 = LLT::vector(3, 32); 187 const LLT V4S32 = LLT::vector(4, 32); 188 const LLT V5S32 = LLT::vector(5, 32); 189 const LLT V6S32 = LLT::vector(6, 32); 190 const LLT V7S32 = LLT::vector(7, 32); 191 const LLT V8S32 = LLT::vector(8, 32); 192 const LLT V9S32 = LLT::vector(9, 32); 193 const LLT V10S32 = LLT::vector(10, 32); 194 const LLT V11S32 = LLT::vector(11, 32); 195 const LLT V12S32 = LLT::vector(12, 32); 196 const LLT V13S32 = LLT::vector(13, 32); 197 const LLT V14S32 = 
LLT::vector(14, 32); 198 const LLT V15S32 = LLT::vector(15, 32); 199 const LLT V16S32 = LLT::vector(16, 32); 200 const LLT V32S32 = LLT::vector(32, 32); 201 202 const LLT V2S64 = LLT::vector(2, 64); 203 const LLT V3S64 = LLT::vector(3, 64); 204 const LLT V4S64 = LLT::vector(4, 64); 205 const LLT V5S64 = LLT::vector(5, 64); 206 const LLT V6S64 = LLT::vector(6, 64); 207 const LLT V7S64 = LLT::vector(7, 64); 208 const LLT V8S64 = LLT::vector(8, 64); 209 const LLT V16S64 = LLT::vector(16, 64); 210 211 std::initializer_list<LLT> AllS32Vectors = 212 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 213 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 214 std::initializer_list<LLT> AllS64Vectors = 215 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 216 217 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 218 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 219 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 220 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 221 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 222 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 223 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 224 225 const LLT CodePtr = FlatPtr; 226 227 const std::initializer_list<LLT> AddrSpaces64 = { 228 GlobalPtr, ConstantPtr, FlatPtr 229 }; 230 231 const std::initializer_list<LLT> AddrSpaces32 = { 232 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 233 }; 234 235 const std::initializer_list<LLT> FPTypesBase = { 236 S32, S64 237 }; 238 239 const std::initializer_list<LLT> FPTypes16 = { 240 S32, S64, S16 241 }; 242 243 const std::initializer_list<LLT> FPTypesPK16 = { 244 S32, S64, S16, V2S16 245 }; 246 247 setAction({G_BRCOND, S1}, Legal); 248 249 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 250 // elements for v3s16 251 getActionDefinitionsBuilder(G_PHI) 252 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 253 .legalFor(AllS32Vectors) 254 .legalFor(AllS64Vectors) 255 .legalFor(AddrSpaces64) 256 .legalFor(AddrSpaces32) 257 .clampScalar(0, S32, S256) 258 .widenScalarToNextPow2(0, 32) 259 .clampMaxNumElements(0, S32, 16) 260 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 261 .legalIf(isPointer(0)); 262 263 if (ST.has16BitInsts()) { 264 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 265 .legalFor({S32, S16}) 266 .clampScalar(0, S16, S32) 267 .scalarize(0); 268 } else { 269 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 270 .legalFor({S32}) 271 .clampScalar(0, S32, S32) 272 .scalarize(0); 273 } 274 275 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 276 .legalFor({S32}) 277 .clampScalar(0, S32, S32) 278 .scalarize(0); 279 280 // Report legal for any types we can handle anywhere. For the cases only legal 281 // on the SALU, RegBankSelect will be able to re-legalize. 282 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 283 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 284 .clampScalar(0, S32, S64) 285 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 286 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 287 .widenScalarToNextPow2(0) 288 .scalarize(0); 289 290 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 291 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 292 .legalFor({{S32, S1}}) 293 .clampScalar(0, S32, S32) 294 .scalarize(0); // TODO: Implement. 
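  // The signed-overflow ops below have no dedicated instruction, so lower()
  // hands them to the generic LegalizerHelper expansion. A rough sketch of the
  // expected expansion for G_SADDO (an illustration, not necessarily the exact
  // sequence the helper emits):
  //   %res = G_ADD %lhs, %rhs
  //   overflow iff %lhs and %rhs have the same sign and %res has the other one.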
295 296 getActionDefinitionsBuilder({G_SADDO, G_SSUBO}) 297 .lower(); 298 299 getActionDefinitionsBuilder(G_BITCAST) 300 // Don't worry about the size constraint. 301 .legalIf(all(isRegisterType(0), isRegisterType(1))) 302 // FIXME: Testing hack 303 .legalForCartesianProduct({S16, LLT::vector(2, 8), }); 304 305 getActionDefinitionsBuilder(G_FCONSTANT) 306 .legalFor({S32, S64, S16}) 307 .clampScalar(0, S16, S64); 308 309 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 310 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 311 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 312 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 313 .clampScalarOrElt(0, S32, S1024) 314 .legalIf(isMultiple32(0)) 315 .widenScalarToNextPow2(0, 32) 316 .clampMaxNumElements(0, S32, 16); 317 318 319 // FIXME: i1 operands to intrinsics should always be legal, but other i1 320 // values may not be legal. We need to figure out how to distinguish 321 // between these two scenarios. 322 getActionDefinitionsBuilder(G_CONSTANT) 323 .legalFor({S1, S32, S64, S16, GlobalPtr, 324 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 325 .clampScalar(0, S32, S64) 326 .widenScalarToNextPow2(0) 327 .legalIf(isPointer(0)); 328 329 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 330 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 331 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); 332 333 334 auto &FPOpActions = getActionDefinitionsBuilder( 335 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 336 .legalFor({S32, S64}); 337 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 338 .customFor({S32, S64}); 339 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 340 .customFor({S32, S64}); 341 342 if (ST.has16BitInsts()) { 343 if (ST.hasVOP3PInsts()) 344 FPOpActions.legalFor({S16, V2S16}); 345 else 346 FPOpActions.legalFor({S16}); 347 348 TrigActions.customFor({S16}); 349 FDIVActions.customFor({S16}); 350 } 351 352 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 353 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 354 355 if (ST.hasVOP3PInsts()) { 356 MinNumMaxNum.customFor(FPTypesPK16) 357 .clampMaxNumElements(0, S16, 2) 358 .clampScalar(0, S16, S64) 359 .scalarize(0); 360 } else if (ST.has16BitInsts()) { 361 MinNumMaxNum.customFor(FPTypes16) 362 .clampScalar(0, S16, S64) 363 .scalarize(0); 364 } else { 365 MinNumMaxNum.customFor(FPTypesBase) 366 .clampScalar(0, S32, S64) 367 .scalarize(0); 368 } 369 370 if (ST.hasVOP3PInsts()) 371 FPOpActions.clampMaxNumElements(0, S16, 2); 372 373 FPOpActions 374 .scalarize(0) 375 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 376 377 TrigActions 378 .scalarize(0) 379 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 380 381 FDIVActions 382 .scalarize(0) 383 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 384 385 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 386 .legalFor(FPTypesPK16) 387 .clampMaxNumElements(0, S16, 2) 388 .scalarize(0) 389 .clampScalar(0, S16, S64); 390 391 // TODO: Implement 392 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); 393 394 if (ST.has16BitInsts()) { 395 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 396 .legalFor({S32, S64, S16}) 397 .scalarize(0) 398 .clampScalar(0, S16, S64); 399 } else { 400 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 401 .legalFor({S32, S64}) 402 .scalarize(0) 403 .clampScalar(0, S32, S64); 404 } 405 406 getActionDefinitionsBuilder(G_FPTRUNC) 407 .legalFor({{S32, S64}, {S16, S32}}) 408 .scalarize(0); 409 410 getActionDefinitionsBuilder(G_FPEXT) 411 .legalFor({{S64, S32}, {S32, S16}}) 412 .lowerFor({{S64, S16}}) // FIXME: Implement 413 .scalarize(0); 414 415 // TODO: Verify V_BFI_B32 is generated from expanded bit ops. 416 getActionDefinitionsBuilder(G_FCOPYSIGN).lower(); 417 418 getActionDefinitionsBuilder(G_FSUB) 419 // Use actual fsub instruction 420 .legalFor({S32}) 421 // Must use fadd + fneg 422 .lowerFor({S64, S16, V2S16}) 423 .scalarize(0) 424 .clampScalar(0, S32, S64); 425 426 // Whether this is legal depends on the floating point mode for the function. 427 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 428 if (ST.hasMadF16()) 429 FMad.customFor({S32, S16}); 430 else 431 FMad.customFor({S32}); 432 FMad.scalarize(0) 433 .lower(); 434 435 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 436 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 437 {S32, S1}, {S64, S1}, {S16, S1}, 438 {S96, S32}, 439 // FIXME: Hack 440 {S64, LLT::scalar(33)}, 441 {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}}) 442 .scalarize(0); 443 444 // TODO: Split s1->s64 during regbankselect for VALU. 
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
521 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, 522 G_CTTZ, G_CTTZ_ZERO_UNDEF, 523 G_CTPOP}) 524 .legalFor({{S32, S32}, {S32, S64}}) 525 .clampScalar(0, S32, S32) 526 .clampScalar(1, S32, S64) 527 .scalarize(0) 528 .widenScalarToNextPow2(0, 32) 529 .widenScalarToNextPow2(1, 32); 530 531 // TODO: Expand for > s32 532 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) 533 .legalFor({S32}) 534 .clampScalar(0, S32, S32) 535 .scalarize(0); 536 537 if (ST.has16BitInsts()) { 538 if (ST.hasVOP3PInsts()) { 539 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 540 .legalFor({S32, S16, V2S16}) 541 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 542 .clampMaxNumElements(0, S16, 2) 543 .clampScalar(0, S16, S32) 544 .widenScalarToNextPow2(0) 545 .scalarize(0); 546 } else { 547 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 548 .legalFor({S32, S16}) 549 .widenScalarToNextPow2(0) 550 .clampScalar(0, S16, S32) 551 .scalarize(0); 552 } 553 } else { 554 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 555 .legalFor({S32}) 556 .clampScalar(0, S32, S32) 557 .widenScalarToNextPow2(0) 558 .scalarize(0); 559 } 560 561 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 562 return [=](const LegalityQuery &Query) { 563 return Query.Types[TypeIdx0].getSizeInBits() < 564 Query.Types[TypeIdx1].getSizeInBits(); 565 }; 566 }; 567 568 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 569 return [=](const LegalityQuery &Query) { 570 return Query.Types[TypeIdx0].getSizeInBits() > 571 Query.Types[TypeIdx1].getSizeInBits(); 572 }; 573 }; 574 575 getActionDefinitionsBuilder(G_INTTOPTR) 576 // List the common cases 577 .legalForCartesianProduct(AddrSpaces64, {S64}) 578 .legalForCartesianProduct(AddrSpaces32, {S32}) 579 .scalarize(0) 580 // Accept any address space as long as the size matches 581 .legalIf(sameSize(0, 1)) 582 .widenScalarIf(smallerThan(1, 0), 583 [](const LegalityQuery &Query) { 584 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 585 }) 586 .narrowScalarIf(greaterThan(1, 0), 587 [](const LegalityQuery &Query) { 588 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 589 }); 590 591 getActionDefinitionsBuilder(G_PTRTOINT) 592 // List the common cases 593 .legalForCartesianProduct(AddrSpaces64, {S64}) 594 .legalForCartesianProduct(AddrSpaces32, {S32}) 595 .scalarize(0) 596 // Accept any address space as long as the size matches 597 .legalIf(sameSize(0, 1)) 598 .widenScalarIf(smallerThan(0, 1), 599 [](const LegalityQuery &Query) { 600 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 601 }) 602 .narrowScalarIf( 603 greaterThan(0, 1), 604 [](const LegalityQuery &Query) { 605 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 606 }); 607 608 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 609 .scalarize(0) 610 .custom(); 611 612 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 613 // handle some operations by just promoting the register during 614 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 615 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { 616 switch (AS) { 617 // FIXME: Private element size. 618 case AMDGPUAS::PRIVATE_ADDRESS: 619 return 32; 620 // FIXME: Check subtarget 621 case AMDGPUAS::LOCAL_ADDRESS: 622 return ST.useDS128() ? 128 : 64; 623 624 // Treat constant and global as identical. 
SMRD loads are sometimes usable 625 // for global loads (ideally constant address space should be eliminated) 626 // depending on the context. Legality cannot be context dependent, but 627 // RegBankSelect can split the load as necessary depending on the pointer 628 // register bank/uniformity and if the memory is invariant or not written in 629 // a kernel. 630 case AMDGPUAS::CONSTANT_ADDRESS: 631 case AMDGPUAS::GLOBAL_ADDRESS: 632 return 512; 633 default: 634 return 128; 635 } 636 }; 637 638 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { 639 const LLT DstTy = Query.Types[0]; 640 641 // Split vector extloads. 642 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 643 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 644 return true; 645 646 const LLT PtrTy = Query.Types[1]; 647 unsigned AS = PtrTy.getAddressSpace(); 648 if (MemSize > maxSizeForAddrSpace(AS)) 649 return true; 650 651 // Catch weird sized loads that don't evenly divide into the access sizes 652 // TODO: May be able to widen depending on alignment etc. 653 unsigned NumRegs = MemSize / 32; 654 if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) 655 return true; 656 657 unsigned Align = Query.MMODescrs[0].AlignInBits; 658 if (Align < MemSize) { 659 const SITargetLowering *TLI = ST.getTargetLowering(); 660 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 661 } 662 663 return false; 664 }; 665 666 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 667 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 668 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 669 670 // TODO: Refine based on subtargets which support unaligned access or 128-bit 671 // LDS 672 // TODO: Unsupported flat for SI. 673 674 for (unsigned Op : {G_LOAD, G_STORE}) { 675 const bool IsStore = Op == G_STORE; 676 677 auto &Actions = getActionDefinitionsBuilder(Op); 678 // Whitelist the common cases. 
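    // Each mem-desc entry below reads as {register type, pointer type,
    // memory size in bits, minimum alignment in bits}; e.g.
    // {S32, GlobalPtr, 8, GlobalAlign8} is an extending 8-bit global load into
    // a 32-bit register. A 0 alignment (used when unaligned buffer access is
    // available) effectively imposes no alignment restriction.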
679 // TODO: Pointer loads 680 // TODO: Wide constant loads 681 // TODO: Only CI+ has 3x loads 682 // TODO: Loads to s16 on gfx9 683 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 684 {V2S32, GlobalPtr, 64, GlobalAlign32}, 685 {V3S32, GlobalPtr, 96, GlobalAlign32}, 686 {S96, GlobalPtr, 96, GlobalAlign32}, 687 {V4S32, GlobalPtr, 128, GlobalAlign32}, 688 {S128, GlobalPtr, 128, GlobalAlign32}, 689 {S64, GlobalPtr, 64, GlobalAlign32}, 690 {V2S64, GlobalPtr, 128, GlobalAlign32}, 691 {V2S16, GlobalPtr, 32, GlobalAlign32}, 692 {S32, GlobalPtr, 8, GlobalAlign8}, 693 {S32, GlobalPtr, 16, GlobalAlign16}, 694 695 {S32, LocalPtr, 32, 32}, 696 {S64, LocalPtr, 64, 32}, 697 {V2S32, LocalPtr, 64, 32}, 698 {S32, LocalPtr, 8, 8}, 699 {S32, LocalPtr, 16, 16}, 700 {V2S16, LocalPtr, 32, 32}, 701 702 {S32, PrivatePtr, 32, 32}, 703 {S32, PrivatePtr, 8, 8}, 704 {S32, PrivatePtr, 16, 16}, 705 {V2S16, PrivatePtr, 32, 32}, 706 707 {S32, FlatPtr, 32, GlobalAlign32}, 708 {S32, FlatPtr, 16, GlobalAlign16}, 709 {S32, FlatPtr, 8, GlobalAlign8}, 710 {V2S16, FlatPtr, 32, GlobalAlign32}, 711 712 {S32, ConstantPtr, 32, GlobalAlign32}, 713 {V2S32, ConstantPtr, 64, GlobalAlign32}, 714 {V3S32, ConstantPtr, 96, GlobalAlign32}, 715 {V4S32, ConstantPtr, 128, GlobalAlign32}, 716 {S64, ConstantPtr, 64, GlobalAlign32}, 717 {S128, ConstantPtr, 128, GlobalAlign32}, 718 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 719 Actions 720 .customIf(typeIs(1, Constant32Ptr)) 721 .narrowScalarIf( 722 [=](const LegalityQuery &Query) -> bool { 723 return !Query.Types[0].isVector() && needToSplitLoad(Query); 724 }, 725 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 726 const LLT DstTy = Query.Types[0]; 727 const LLT PtrTy = Query.Types[1]; 728 729 const unsigned DstSize = DstTy.getSizeInBits(); 730 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 731 732 // Split extloads. 733 if (DstSize > MemSize) 734 return std::make_pair(0, LLT::scalar(MemSize)); 735 736 if (DstSize > 32 && (DstSize % 32 != 0)) { 737 // FIXME: Need a way to specify non-extload of larger size if 738 // suitably aligned. 739 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 740 } 741 742 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 743 if (MemSize > MaxSize) 744 return std::make_pair(0, LLT::scalar(MaxSize)); 745 746 unsigned Align = Query.MMODescrs[0].AlignInBits; 747 return std::make_pair(0, LLT::scalar(Align)); 748 }) 749 .fewerElementsIf( 750 [=](const LegalityQuery &Query) -> bool { 751 return Query.Types[0].isVector() && needToSplitLoad(Query); 752 }, 753 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 754 const LLT DstTy = Query.Types[0]; 755 const LLT PtrTy = Query.Types[1]; 756 757 LLT EltTy = DstTy.getElementType(); 758 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 759 760 // Split if it's too large for the address space. 761 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 762 unsigned NumElts = DstTy.getNumElements(); 763 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 764 765 // FIXME: Refine when odd breakdowns handled 766 // The scalars will need to be re-legalized. 767 if (NumPieces == 1 || NumPieces >= NumElts || 768 NumElts % NumPieces != 0) 769 return std::make_pair(0, EltTy); 770 771 return std::make_pair(0, 772 LLT::vector(NumElts / NumPieces, EltTy)); 773 } 774 775 // Need to split because of alignment. 
776 unsigned Align = Query.MMODescrs[0].AlignInBits; 777 unsigned EltSize = EltTy.getSizeInBits(); 778 if (EltSize > Align && 779 (EltSize / Align < DstTy.getNumElements())) { 780 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 781 } 782 783 // May need relegalization for the scalars. 784 return std::make_pair(0, EltTy); 785 }) 786 .minScalar(0, S32); 787 788 if (IsStore) 789 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 790 791 // TODO: Need a bitcast lower option? 792 Actions 793 .legalIf([=](const LegalityQuery &Query) { 794 const LLT Ty0 = Query.Types[0]; 795 unsigned Size = Ty0.getSizeInBits(); 796 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 797 unsigned Align = Query.MMODescrs[0].AlignInBits; 798 799 // No extending vector loads. 800 if (Size > MemSize && Ty0.isVector()) 801 return false; 802 803 // FIXME: Widening store from alignment not valid. 804 if (MemSize < Size) 805 MemSize = std::max(MemSize, Align); 806 807 switch (MemSize) { 808 case 8: 809 case 16: 810 return Size == 32; 811 case 32: 812 case 64: 813 case 128: 814 return true; 815 case 96: 816 return ST.hasDwordx3LoadStores(); 817 case 256: 818 case 512: 819 return true; 820 default: 821 return false; 822 } 823 }) 824 .widenScalarToNextPow2(0) 825 // TODO: v3s32->v4s32 with alignment 826 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 827 } 828 829 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 830 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 831 {S32, GlobalPtr, 16, 2 * 8}, 832 {S32, LocalPtr, 8, 8}, 833 {S32, LocalPtr, 16, 16}, 834 {S32, PrivatePtr, 8, 8}, 835 {S32, PrivatePtr, 16, 16}, 836 {S32, ConstantPtr, 8, 8}, 837 {S32, ConstantPtr, 16, 2 * 8}}); 838 if (ST.hasFlatAddressSpace()) { 839 ExtLoads.legalForTypesWithMemDesc( 840 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 841 } 842 843 ExtLoads.clampScalar(0, S32, S32) 844 .widenScalarToNextPow2(0) 845 .unsupportedIfMemSizeNotPow2() 846 .lower(); 847 848 auto &Atomics = getActionDefinitionsBuilder( 849 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 850 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 851 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 852 G_ATOMICRMW_UMIN}) 853 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 854 {S64, GlobalPtr}, {S64, LocalPtr}}); 855 if (ST.hasFlatAddressSpace()) { 856 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 857 } 858 859 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 860 .legalFor({{S32, LocalPtr}}); 861 862 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 863 // demarshalling 864 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 865 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 866 {S32, FlatPtr}, {S64, FlatPtr}}) 867 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 868 {S32, RegionPtr}, {S64, RegionPtr}}); 869 870 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) 871 .lower(); 872 873 // TODO: Pointer types, any 32-bit or 64-bit vector 874 getActionDefinitionsBuilder(G_SELECT) 875 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 876 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 877 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1}) 878 .clampScalar(0, S16, S64) 879 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 880 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 881 .scalarize(1) 882 .clampMaxNumElements(0, S32, 2) 883 .clampMaxNumElements(0, LocalPtr, 2) 884 .clampMaxNumElements(0, PrivatePtr, 2) 885 .scalarize(0) 886 .widenScalarToNextPow2(0) 887 
.legalIf(all(isPointer(0), typeIs(1, S1))); 888 889 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 890 // be more flexible with the shift amount type. 891 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 892 .legalFor({{S32, S32}, {S64, S32}}); 893 if (ST.has16BitInsts()) { 894 if (ST.hasVOP3PInsts()) { 895 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 896 .clampMaxNumElements(0, S16, 2); 897 } else 898 Shifts.legalFor({{S16, S32}, {S16, S16}}); 899 900 Shifts.clampScalar(1, S16, S32); 901 Shifts.clampScalar(0, S16, S64); 902 Shifts.widenScalarToNextPow2(0, 16); 903 } else { 904 // Make sure we legalize the shift amount type first, as the general 905 // expansion for the shifted type will produce much worse code if it hasn't 906 // been truncated already. 907 Shifts.clampScalar(1, S32, S32); 908 Shifts.clampScalar(0, S32, S64); 909 Shifts.widenScalarToNextPow2(0, 32); 910 } 911 Shifts.scalarize(0); 912 913 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 914 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 915 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 916 unsigned IdxTypeIdx = 2; 917 918 getActionDefinitionsBuilder(Op) 919 .customIf([=](const LegalityQuery &Query) { 920 const LLT EltTy = Query.Types[EltTypeIdx]; 921 const LLT VecTy = Query.Types[VecTypeIdx]; 922 const LLT IdxTy = Query.Types[IdxTypeIdx]; 923 return (EltTy.getSizeInBits() == 16 || 924 EltTy.getSizeInBits() % 32 == 0) && 925 VecTy.getSizeInBits() % 32 == 0 && 926 VecTy.getSizeInBits() <= 1024 && 927 IdxTy.getSizeInBits() == 32; 928 }) 929 .clampScalar(EltTypeIdx, S32, S64) 930 .clampScalar(VecTypeIdx, S32, S64) 931 .clampScalar(IdxTypeIdx, S32, S32); 932 } 933 934 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 935 .unsupportedIf([=](const LegalityQuery &Query) { 936 const LLT &EltTy = Query.Types[1].getElementType(); 937 return Query.Types[0] != EltTy; 938 }); 939 940 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 941 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 942 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 943 944 // FIXME: Doesn't handle extract of illegal sizes. 945 getActionDefinitionsBuilder(Op) 946 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 947 // FIXME: Multiples of 16 should not be legal. 
948 .legalIf([=](const LegalityQuery &Query) { 949 const LLT BigTy = Query.Types[BigTyIdx]; 950 const LLT LitTy = Query.Types[LitTyIdx]; 951 return (BigTy.getSizeInBits() % 32 == 0) && 952 (LitTy.getSizeInBits() % 16 == 0); 953 }) 954 .widenScalarIf( 955 [=](const LegalityQuery &Query) { 956 const LLT BigTy = Query.Types[BigTyIdx]; 957 return (BigTy.getScalarSizeInBits() < 16); 958 }, 959 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 960 .widenScalarIf( 961 [=](const LegalityQuery &Query) { 962 const LLT LitTy = Query.Types[LitTyIdx]; 963 return (LitTy.getScalarSizeInBits() < 16); 964 }, 965 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 966 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 967 .widenScalarToNextPow2(BigTyIdx, 32); 968 969 } 970 971 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 972 .legalForCartesianProduct(AllS32Vectors, {S32}) 973 .legalForCartesianProduct(AllS64Vectors, {S64}) 974 .clampNumElements(0, V16S32, V32S32) 975 .clampNumElements(0, V2S64, V16S64) 976 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 977 978 if (ST.hasScalarPackInsts()) 979 BuildVector.legalFor({V2S16, S32}); 980 981 BuildVector 982 .minScalarSameAs(1, 0) 983 .legalIf(isRegisterType(0)) 984 .minScalarOrElt(0, S32); 985 986 if (ST.hasScalarPackInsts()) { 987 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 988 .legalFor({V2S16, S32}) 989 .lower(); 990 } else { 991 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 992 .lower(); 993 } 994 995 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 996 .legalIf(isRegisterType(0)); 997 998 // TODO: Don't fully scalarize v2s16 pieces 999 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1000 1001 // Merge/Unmerge 1002 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1003 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1004 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1005 1006 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1007 const LLT &Ty = Query.Types[TypeIdx]; 1008 if (Ty.isVector()) { 1009 const LLT &EltTy = Ty.getElementType(); 1010 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) 1011 return true; 1012 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1013 return true; 1014 } 1015 return false; 1016 }; 1017 1018 auto &Builder = getActionDefinitionsBuilder(Op) 1019 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1020 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1021 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1022 // valid. 1023 .clampScalar(LitTyIdx, S16, S256) 1024 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1025 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1026 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1027 elementTypeIs(1, S16)), 1028 changeTo(1, V2S16)) 1029 // Break up vectors with weird elements into scalars 1030 .fewerElementsIf( 1031 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, 1032 scalarize(0)) 1033 .fewerElementsIf( 1034 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, 1035 scalarize(1)) 1036 .clampScalar(BigTyIdx, S32, S1024) 1037 .lowerFor({{S16, V2S16}}); 1038 1039 if (Op == G_MERGE_VALUES) { 1040 Builder.widenScalarIf( 1041 // TODO: Use 16-bit shifts if legal for 8-bit values? 
1042 [=](const LegalityQuery &Query) { 1043 const LLT Ty = Query.Types[LitTyIdx]; 1044 return Ty.getSizeInBits() < 32; 1045 }, 1046 changeTo(LitTyIdx, S32)); 1047 } 1048 1049 Builder.widenScalarIf( 1050 [=](const LegalityQuery &Query) { 1051 const LLT Ty = Query.Types[BigTyIdx]; 1052 return !isPowerOf2_32(Ty.getSizeInBits()) && 1053 Ty.getSizeInBits() % 16 != 0; 1054 }, 1055 [=](const LegalityQuery &Query) { 1056 // Pick the next power of 2, or a multiple of 64 over 128. 1057 // Whichever is smaller. 1058 const LLT &Ty = Query.Types[BigTyIdx]; 1059 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1060 if (NewSizeInBits >= 256) { 1061 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1062 if (RoundedTo < NewSizeInBits) 1063 NewSizeInBits = RoundedTo; 1064 } 1065 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1066 }) 1067 .legalIf([=](const LegalityQuery &Query) { 1068 const LLT &BigTy = Query.Types[BigTyIdx]; 1069 const LLT &LitTy = Query.Types[LitTyIdx]; 1070 1071 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1072 return false; 1073 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1074 return false; 1075 1076 return BigTy.getSizeInBits() % 16 == 0 && 1077 LitTy.getSizeInBits() % 16 == 0 && 1078 BigTy.getSizeInBits() <= 1024; 1079 }) 1080 // Any vectors left are the wrong size. Scalarize them. 1081 .scalarize(0) 1082 .scalarize(1); 1083 } 1084 1085 getActionDefinitionsBuilder(G_SEXT_INREG).lower(); 1086 1087 computeTables(); 1088 verify(*ST.getInstrInfo()); 1089 } 1090 1091 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1092 MachineRegisterInfo &MRI, 1093 MachineIRBuilder &B, 1094 GISelChangeObserver &Observer) const { 1095 switch (MI.getOpcode()) { 1096 case TargetOpcode::G_ADDRSPACE_CAST: 1097 return legalizeAddrSpaceCast(MI, MRI, B); 1098 case TargetOpcode::G_FRINT: 1099 return legalizeFrint(MI, MRI, B); 1100 case TargetOpcode::G_FCEIL: 1101 return legalizeFceil(MI, MRI, B); 1102 case TargetOpcode::G_INTRINSIC_TRUNC: 1103 return legalizeIntrinsicTrunc(MI, MRI, B); 1104 case TargetOpcode::G_SITOFP: 1105 return legalizeITOFP(MI, MRI, B, true); 1106 case TargetOpcode::G_UITOFP: 1107 return legalizeITOFP(MI, MRI, B, false); 1108 case TargetOpcode::G_FMINNUM: 1109 case TargetOpcode::G_FMAXNUM: 1110 case TargetOpcode::G_FMINNUM_IEEE: 1111 case TargetOpcode::G_FMAXNUM_IEEE: 1112 return legalizeMinNumMaxNum(MI, MRI, B); 1113 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1114 return legalizeExtractVectorElt(MI, MRI, B); 1115 case TargetOpcode::G_INSERT_VECTOR_ELT: 1116 return legalizeInsertVectorElt(MI, MRI, B); 1117 case TargetOpcode::G_FSIN: 1118 case TargetOpcode::G_FCOS: 1119 return legalizeSinCos(MI, MRI, B); 1120 case TargetOpcode::G_GLOBAL_VALUE: 1121 return legalizeGlobalValue(MI, MRI, B); 1122 case TargetOpcode::G_LOAD: 1123 return legalizeLoad(MI, MRI, B, Observer); 1124 case TargetOpcode::G_FMAD: 1125 return legalizeFMad(MI, MRI, B); 1126 case TargetOpcode::G_FDIV: 1127 return legalizeFDIV(MI, MRI, B); 1128 case TargetOpcode::G_ATOMIC_CMPXCHG: 1129 return legalizeAtomicCmpXChg(MI, MRI, B); 1130 default: 1131 return false; 1132 } 1133 1134 llvm_unreachable("expected switch to return"); 1135 } 1136 1137 Register AMDGPULegalizerInfo::getSegmentAperture( 1138 unsigned AS, 1139 MachineRegisterInfo &MRI, 1140 MachineIRBuilder &B) const { 1141 MachineFunction &MF = B.getMF(); 1142 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1143 const LLT S32 = LLT::scalar(32); 1144 1145 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == 
AMDGPUAS::PRIVATE_ADDRESS); 1146 1147 if (ST.hasApertureRegs()) { 1148 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1149 // getreg. 1150 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1151 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1152 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1153 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1154 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1155 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1156 unsigned Encoding = 1157 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1158 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1159 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1160 1161 Register ApertureReg = MRI.createGenericVirtualRegister(S32); 1162 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1163 1164 B.buildInstr(AMDGPU::S_GETREG_B32) 1165 .addDef(GetReg) 1166 .addImm(Encoding); 1167 MRI.setType(GetReg, S32); 1168 1169 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1170 B.buildInstr(TargetOpcode::G_SHL) 1171 .addDef(ApertureReg) 1172 .addUse(GetReg) 1173 .addUse(ShiftAmt.getReg(0)); 1174 1175 return ApertureReg; 1176 } 1177 1178 Register QueuePtr = MRI.createGenericVirtualRegister( 1179 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1180 1181 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1182 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1183 return Register(); 1184 1185 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1186 // private_segment_aperture_base_hi. 1187 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1188 1189 // FIXME: Don't use undef 1190 Value *V = UndefValue::get(PointerType::get( 1191 Type::getInt8Ty(MF.getFunction().getContext()), 1192 AMDGPUAS::CONSTANT_ADDRESS)); 1193 1194 MachinePointerInfo PtrInfo(V, StructOffset); 1195 MachineMemOperand *MMO = MF.getMachineMemOperand( 1196 PtrInfo, 1197 MachineMemOperand::MOLoad | 1198 MachineMemOperand::MODereferenceable | 1199 MachineMemOperand::MOInvariant, 1200 4, 1201 MinAlign(64, StructOffset)); 1202 1203 Register LoadResult = MRI.createGenericVirtualRegister(S32); 1204 Register LoadAddr; 1205 1206 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1207 B.buildLoad(LoadResult, LoadAddr, *MMO); 1208 return LoadResult; 1209 } 1210 1211 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1212 MachineInstr &MI, MachineRegisterInfo &MRI, 1213 MachineIRBuilder &B) const { 1214 MachineFunction &MF = B.getMF(); 1215 1216 B.setInstr(MI); 1217 1218 const LLT S32 = LLT::scalar(32); 1219 Register Dst = MI.getOperand(0).getReg(); 1220 Register Src = MI.getOperand(1).getReg(); 1221 1222 LLT DstTy = MRI.getType(Dst); 1223 LLT SrcTy = MRI.getType(Src); 1224 unsigned DestAS = DstTy.getAddressSpace(); 1225 unsigned SrcAS = SrcTy.getAddressSpace(); 1226 1227 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1228 // vector element. 1229 assert(!DstTy.isVector()); 1230 1231 const AMDGPUTargetMachine &TM 1232 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1233 1234 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1235 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1236 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1237 return true; 1238 } 1239 1240 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1241 // Truncate. 
1242 B.buildExtract(Dst, Src, 0); 1243 MI.eraseFromParent(); 1244 return true; 1245 } 1246 1247 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1248 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1249 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1250 1251 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1252 // another. Merge operands are required to be the same type, but creating an 1253 // extra ptrtoint would be kind of pointless. 1254 auto HighAddr = B.buildConstant( 1255 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1256 B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); 1257 MI.eraseFromParent(); 1258 return true; 1259 } 1260 1261 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1262 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1263 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1264 unsigned NullVal = TM.getNullPointerValue(DestAS); 1265 1266 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1267 auto FlatNull = B.buildConstant(SrcTy, 0); 1268 1269 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); 1270 1271 // Extract low 32-bits of the pointer. 1272 B.buildExtract(PtrLo32, Src, 0); 1273 1274 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1275 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); 1276 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1277 1278 MI.eraseFromParent(); 1279 return true; 1280 } 1281 1282 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1283 return false; 1284 1285 if (!ST.hasFlatAddressSpace()) 1286 return false; 1287 1288 auto SegmentNull = 1289 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1290 auto FlatNull = 1291 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1292 1293 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1294 if (!ApertureReg.isValid()) 1295 return false; 1296 1297 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1298 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); 1299 1300 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); 1301 1302 // Coerce the type of the low half of the result so we can use merge_values. 1303 Register SrcAsInt = MRI.createGenericVirtualRegister(S32); 1304 B.buildInstr(TargetOpcode::G_PTRTOINT) 1305 .addDef(SrcAsInt) 1306 .addUse(Src); 1307 1308 // TODO: Should we allow mismatched types but matching sizes in merges to 1309 // avoid the ptrtoint? 1310 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); 1311 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); 1312 1313 MI.eraseFromParent(); 1314 return true; 1315 } 1316 1317 bool AMDGPULegalizerInfo::legalizeFrint( 1318 MachineInstr &MI, MachineRegisterInfo &MRI, 1319 MachineIRBuilder &B) const { 1320 B.setInstr(MI); 1321 1322 Register Src = MI.getOperand(1).getReg(); 1323 LLT Ty = MRI.getType(Src); 1324 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1325 1326 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1327 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1328 1329 auto C1 = B.buildFConstant(Ty, C1Val); 1330 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1331 1332 // TODO: Should this propagate fast-math-flags? 
1333 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1334 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1335 1336 auto C2 = B.buildFConstant(Ty, C2Val); 1337 auto Fabs = B.buildFAbs(Ty, Src); 1338 1339 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1340 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1341 return true; 1342 } 1343 1344 bool AMDGPULegalizerInfo::legalizeFceil( 1345 MachineInstr &MI, MachineRegisterInfo &MRI, 1346 MachineIRBuilder &B) const { 1347 B.setInstr(MI); 1348 1349 const LLT S1 = LLT::scalar(1); 1350 const LLT S64 = LLT::scalar(64); 1351 1352 Register Src = MI.getOperand(1).getReg(); 1353 assert(MRI.getType(Src) == S64); 1354 1355 // result = trunc(src) 1356 // if (src > 0.0 && src != result) 1357 // result += 1.0 1358 1359 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); 1360 1361 const auto Zero = B.buildFConstant(S64, 0.0); 1362 const auto One = B.buildFConstant(S64, 1.0); 1363 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1364 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1365 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1366 auto Add = B.buildSelect(S64, And, One, Zero); 1367 1368 // TODO: Should this propagate fast-math-flags? 1369 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1370 return true; 1371 } 1372 1373 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1374 MachineIRBuilder &B) { 1375 const unsigned FractBits = 52; 1376 const unsigned ExpBits = 11; 1377 LLT S32 = LLT::scalar(32); 1378 1379 auto Const0 = B.buildConstant(S32, FractBits - 32); 1380 auto Const1 = B.buildConstant(S32, ExpBits); 1381 1382 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1383 .addUse(Const0.getReg(0)) 1384 .addUse(Const1.getReg(0)); 1385 1386 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1387 } 1388 1389 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1390 MachineInstr &MI, MachineRegisterInfo &MRI, 1391 MachineIRBuilder &B) const { 1392 B.setInstr(MI); 1393 1394 const LLT S1 = LLT::scalar(1); 1395 const LLT S32 = LLT::scalar(32); 1396 const LLT S64 = LLT::scalar(64); 1397 1398 Register Src = MI.getOperand(1).getReg(); 1399 assert(MRI.getType(Src) == S64); 1400 1401 // TODO: Should this use extract since the low half is unused? 1402 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1403 Register Hi = Unmerge.getReg(1); 1404 1405 // Extract the upper half, since this is where we will find the sign and 1406 // exponent. 1407 auto Exp = extractF64Exponent(Hi, B); 1408 1409 const unsigned FractBits = 52; 1410 1411 // Extract the sign bit. 1412 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1413 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1414 1415 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1416 1417 const auto Zero32 = B.buildConstant(S32, 0); 1418 1419 // Extend back to 64-bits. 
1420 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); 1421 1422 auto Shr = B.buildAShr(S64, FractMask, Exp); 1423 auto Not = B.buildNot(S64, Shr); 1424 auto Tmp0 = B.buildAnd(S64, Src, Not); 1425 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1426 1427 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1428 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1429 1430 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1431 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1432 return true; 1433 } 1434 1435 bool AMDGPULegalizerInfo::legalizeITOFP( 1436 MachineInstr &MI, MachineRegisterInfo &MRI, 1437 MachineIRBuilder &B, bool Signed) const { 1438 B.setInstr(MI); 1439 1440 Register Dst = MI.getOperand(0).getReg(); 1441 Register Src = MI.getOperand(1).getReg(); 1442 1443 const LLT S64 = LLT::scalar(64); 1444 const LLT S32 = LLT::scalar(32); 1445 1446 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1447 1448 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1449 1450 auto CvtHi = Signed ? 1451 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1452 B.buildUITOFP(S64, Unmerge.getReg(1)); 1453 1454 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1455 1456 auto ThirtyTwo = B.buildConstant(S32, 32); 1457 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1458 .addUse(CvtHi.getReg(0)) 1459 .addUse(ThirtyTwo.getReg(0)); 1460 1461 // TODO: Should this propagate fast-math-flags? 1462 B.buildFAdd(Dst, LdExp, CvtLo); 1463 MI.eraseFromParent(); 1464 return true; 1465 } 1466 1467 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1468 MachineInstr &MI, MachineRegisterInfo &MRI, 1469 MachineIRBuilder &B) const { 1470 MachineFunction &MF = B.getMF(); 1471 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1472 1473 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1474 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1475 1476 // With ieee_mode disabled, the instructions have the correct behavior 1477 // already for G_FMINNUM/G_FMAXNUM 1478 if (!MFI->getMode().IEEE) 1479 return !IsIEEEOp; 1480 1481 if (IsIEEEOp) 1482 return true; 1483 1484 MachineIRBuilder HelperBuilder(MI); 1485 GISelObserverWrapper DummyObserver; 1486 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1487 HelperBuilder.setInstr(MI); 1488 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1489 } 1490 1491 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1492 MachineInstr &MI, MachineRegisterInfo &MRI, 1493 MachineIRBuilder &B) const { 1494 // TODO: Should move some of this into LegalizerHelper. 1495 1496 // TODO: Promote dynamic indexing of s16 to s32 1497 // TODO: Dynamic s64 indexing is only legal for SGPR. 1498 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); 1499 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1500 return true; 1501 1502 Register Dst = MI.getOperand(0).getReg(); 1503 Register Vec = MI.getOperand(1).getReg(); 1504 1505 LLT VecTy = MRI.getType(Vec); 1506 LLT EltTy = VecTy.getElementType(); 1507 assert(EltTy == MRI.getType(Dst)); 1508 1509 B.setInstr(MI); 1510 1511 if (IdxVal.getValue() < VecTy.getNumElements()) 1512 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); 1513 else 1514 B.buildUndef(Dst); 1515 1516 MI.eraseFromParent(); 1517 return true; 1518 } 1519 1520 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1521 MachineInstr &MI, MachineRegisterInfo &MRI, 1522 MachineIRBuilder &B) const { 1523 // TODO: Should move some of this into LegalizerHelper. 1524 1525 // TODO: Promote dynamic indexing of s16 to s32 1526 // TODO: Dynamic s64 indexing is only legal for SGPR. 1527 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); 1528 if (!IdxVal) // Dynamic case will be selected to register indexing. 1529 return true; 1530 1531 Register Dst = MI.getOperand(0).getReg(); 1532 Register Vec = MI.getOperand(1).getReg(); 1533 Register Ins = MI.getOperand(2).getReg(); 1534 1535 LLT VecTy = MRI.getType(Vec); 1536 LLT EltTy = VecTy.getElementType(); 1537 assert(EltTy == MRI.getType(Ins)); 1538 1539 B.setInstr(MI); 1540 1541 if (IdxVal.getValue() < VecTy.getNumElements()) 1542 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); 1543 else 1544 B.buildUndef(Dst); 1545 1546 MI.eraseFromParent(); 1547 return true; 1548 } 1549 1550 bool AMDGPULegalizerInfo::legalizeSinCos( 1551 MachineInstr &MI, MachineRegisterInfo &MRI, 1552 MachineIRBuilder &B) const { 1553 B.setInstr(MI); 1554 1555 Register DstReg = MI.getOperand(0).getReg(); 1556 Register SrcReg = MI.getOperand(1).getReg(); 1557 LLT Ty = MRI.getType(DstReg); 1558 unsigned Flags = MI.getFlags(); 1559 1560 Register TrigVal; 1561 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1562 if (ST.hasTrigReducedRange()) { 1563 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1564 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1565 .addUse(MulVal.getReg(0)) 1566 .setMIFlags(Flags).getReg(0); 1567 } else 1568 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1569 1570 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1571 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1572 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1573 .addUse(TrigVal) 1574 .setMIFlags(Flags); 1575 MI.eraseFromParent(); 1576 return true; 1577 } 1578 1579 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1580 Register DstReg, LLT PtrTy, 1581 MachineIRBuilder &B, const GlobalValue *GV, 1582 unsigned Offset, unsigned GAFlags) const { 1583 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1584 // to the following code sequence: 1585 // 1586 // For constant address space: 1587 // s_getpc_b64 s[0:1] 1588 // s_add_u32 s0, s0, $symbol 1589 // s_addc_u32 s1, s1, 0 1590 // 1591 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1592 // a fixup or relocation is emitted to replace $symbol with a literal 1593 // constant, which is a pc-relative offset from the encoding of the $symbol 1594 // operand to the global variable. 
1595 // 1596 // For global address space: 1597 // s_getpc_b64 s[0:1] 1598 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1599 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1600 // 1601 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1602 // fixups or relocations are emitted to replace $symbol@*@lo and 1603 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1604 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1605 // operand to the global variable. 1606 // 1607 // What we want here is an offset from the value returned by s_getpc 1608 // (which is the address of the s_add_u32 instruction) to the global 1609 // variable, but since the encoding of $symbol starts 4 bytes after the start 1610 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1611 // small. This requires us to add 4 to the global variable offset in order to 1612 // compute the correct address. 1613 1614 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1615 1616 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1617 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1618 1619 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1620 .addDef(PCReg); 1621 1622 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1623 if (GAFlags == SIInstrInfo::MO_NONE) 1624 MIB.addImm(0); 1625 else 1626 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1627 1628 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1629 1630 if (PtrTy.getSizeInBits() == 32) 1631 B.buildExtract(DstReg, PCReg, 0); 1632 return true; 1633 } 1634 1635 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1636 MachineInstr &MI, MachineRegisterInfo &MRI, 1637 MachineIRBuilder &B) const { 1638 Register DstReg = MI.getOperand(0).getReg(); 1639 LLT Ty = MRI.getType(DstReg); 1640 unsigned AS = Ty.getAddressSpace(); 1641 1642 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1643 MachineFunction &MF = B.getMF(); 1644 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1645 B.setInstr(MI); 1646 1647 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1648 if (!MFI->isEntryFunction()) { 1649 const Function &Fn = MF.getFunction(); 1650 DiagnosticInfoUnsupported BadLDSDecl( 1651 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1652 Fn.getContext().diagnose(BadLDSDecl); 1653 } 1654 1655 // TODO: We could emit code to handle the initialization somewhere. 
1656     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1657       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1658       MI.eraseFromParent();
1659       return true;
1660     }
1661
1662     const Function &Fn = MF.getFunction();
1663     DiagnosticInfoUnsupported BadInit(
1664       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1665     Fn.getContext().diagnose(BadInit);
1666     return true;
1667   }
1668
1669   const SITargetLowering *TLI = ST.getTargetLowering();
1670
1671   if (TLI->shouldEmitFixup(GV)) {
1672     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1673     MI.eraseFromParent();
1674     return true;
1675   }
1676
1677   if (TLI->shouldEmitPCReloc(GV)) {
1678     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1679     MI.eraseFromParent();
1680     return true;
1681   }
1682
1683   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1684   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1685
1686   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1687     MachinePointerInfo::getGOT(MF),
1688     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1689     MachineMemOperand::MOInvariant,
1690     8 /*Size*/, 8 /*Align*/);
1691
1692   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1693
1694   if (Ty.getSizeInBits() == 32) {
1695     // Truncate if this is a 32-bit constant address.
1696     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1697     B.buildExtract(DstReg, Load, 0);
1698   } else
1699     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1700
1701   MI.eraseFromParent();
1702   return true;
1703 }
1704
1705 bool AMDGPULegalizerInfo::legalizeLoad(
1706   MachineInstr &MI, MachineRegisterInfo &MRI,
1707   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1708   B.setInstr(MI);
1709   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1710   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1711   Observer.changingInstr(MI);
1712   MI.getOperand(1).setReg(Cast.getReg(0));
1713   Observer.changedInstr(MI);
1714   return true;
1715 }
1716
1717 bool AMDGPULegalizerInfo::legalizeFMad(
1718   MachineInstr &MI, MachineRegisterInfo &MRI,
1719   MachineIRBuilder &B) const {
1720   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1721   assert(Ty.isScalar());
1722
1723   MachineFunction &MF = B.getMF();
1724   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1725
1726   // TODO: Always legal with future ftz flag.
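  // The early-outs below keep G_FMAD legal only when denormals are flushed for
  // the result type, since the mad/mac instructions it selects to do not handle
  // denormals. Otherwise it is expanded by LegalizerHelper::lowerFMad, roughly:
  //
  //   %d:_(s16) = G_FMAD %a, %b, %c
  //     -->
  //   %t:_(s16) = G_FMUL %a, %b
  //   %d:_(s16) = G_FADD %t, %c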
1727   if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1728     return true;
1729   if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
1730     return true;
1731
1732
1733   MachineIRBuilder HelperBuilder(MI);
1734   GISelObserverWrapper DummyObserver;
1735   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1736   HelperBuilder.setMBB(*MI.getParent());
1737   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1738 }
1739
1740 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1741   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1742   Register DstReg = MI.getOperand(0).getReg();
1743   Register PtrReg = MI.getOperand(1).getReg();
1744   Register CmpVal = MI.getOperand(2).getReg();
1745   Register NewVal = MI.getOperand(3).getReg();
1746
1747   assert(SITargetLowering::isFlatGlobalAddrSpace(
1748            MRI.getType(PtrReg).getAddressSpace()) &&
1749          "this should not have been custom lowered");
1750
1751   LLT ValTy = MRI.getType(CmpVal);
1752   LLT VecTy = LLT::vector(2, ValTy);
1753
1754   B.setInstr(MI);
1755   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1756
1757   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1758     .addDef(DstReg)
1759     .addUse(PtrReg)
1760     .addUse(PackedVal)
1761     .setMemRefs(MI.memoperands());
1762
1763   MI.eraseFromParent();
1764   return true;
1765 }
1766
1767 // Return the use branch instruction, or null if the usage is invalid.
1768 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1769                                        MachineRegisterInfo &MRI) {
1770   Register CondDef = MI.getOperand(0).getReg();
1771   if (!MRI.hasOneNonDBGUse(CondDef))
1772     return nullptr;
1773
1774   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1775   return UseMI.getParent() == MI.getParent() &&
1776          UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1777 }
1778
1779 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1780                                                 Register Reg, LLT Ty) const {
1781   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1782   if (LiveIn)
1783     return LiveIn;
1784
1785   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1786   MRI.addLiveIn(Reg, NewReg);
1787   return NewReg;
1788 }
1789
1790 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1791                                          const ArgDescriptor *Arg) const {
1792   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1793     return false; // TODO: Handle these
1794
1795   assert(Arg->getRegister().isPhysical());
1796
1797   MachineRegisterInfo &MRI = *B.getMRI();
1798
1799   LLT Ty = MRI.getType(DstReg);
1800   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1801
1802   if (Arg->isMasked()) {
1803     // TODO: Should we try to emit this once in the entry block?
1804     const LLT S32 = LLT::scalar(32);
1805     const unsigned Mask = Arg->getMask();
1806     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1807
1808     Register AndMaskSrc = LiveIn;
1809
1810     if (Shift != 0) {
1811       auto ShiftAmt = B.buildConstant(S32, Shift);
1812       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1813     }
1814
1815     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1816   } else
1817     B.buildCopy(DstReg, LiveIn);
1818
1819   // Insert the argument copy if it doesn't already exist.
1820   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
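  // Worked example for the masked path above (the packing is illustrative):
  // for a workitem ID stored in bits [19:10] of the input register, the
  // descriptor carries Mask = 0xFFC00, so Shift = countTrailingZeros(0xFFC00)
  // = 10 and Mask >> Shift = 0x3FF, producing (constants shown inline for
  // brevity):
  //
  //   %shifted:_(s32) = G_LSHR %livein, 10
  //   %id:_(s32)      = G_AND  %shifted, 0x3FF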
1821 if (!MRI.getVRegDef(LiveIn)) { 1822 // FIXME: Should have scoped insert pt 1823 MachineBasicBlock &OrigInsBB = B.getMBB(); 1824 auto OrigInsPt = B.getInsertPt(); 1825 1826 MachineBasicBlock &EntryMBB = B.getMF().front(); 1827 EntryMBB.addLiveIn(Arg->getRegister()); 1828 B.setInsertPt(EntryMBB, EntryMBB.begin()); 1829 B.buildCopy(LiveIn, Arg->getRegister()); 1830 1831 B.setInsertPt(OrigInsBB, OrigInsPt); 1832 } 1833 1834 return true; 1835 } 1836 1837 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 1838 MachineInstr &MI, 1839 MachineRegisterInfo &MRI, 1840 MachineIRBuilder &B, 1841 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 1842 B.setInstr(MI); 1843 1844 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 1845 1846 const ArgDescriptor *Arg; 1847 const TargetRegisterClass *RC; 1848 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 1849 if (!Arg) { 1850 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 1851 return false; 1852 } 1853 1854 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 1855 MI.eraseFromParent(); 1856 return true; 1857 } 1858 1859 return false; 1860 } 1861 1862 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 1863 MachineRegisterInfo &MRI, 1864 MachineIRBuilder &B) const { 1865 B.setInstr(MI); 1866 Register Dst = MI.getOperand(0).getReg(); 1867 LLT DstTy = MRI.getType(Dst); 1868 LLT S16 = LLT::scalar(16); 1869 LLT S32 = LLT::scalar(32); 1870 LLT S64 = LLT::scalar(64); 1871 1872 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 1873 return true; 1874 1875 if (DstTy == S16) 1876 return legalizeFDIV16(MI, MRI, B); 1877 if (DstTy == S32) 1878 return legalizeFDIV32(MI, MRI, B); 1879 if (DstTy == S64) 1880 return legalizeFDIV64(MI, MRI, B); 1881 1882 return false; 1883 } 1884 1885 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 1886 MachineRegisterInfo &MRI, 1887 MachineIRBuilder &B) const { 1888 Register Res = MI.getOperand(0).getReg(); 1889 Register LHS = MI.getOperand(1).getReg(); 1890 Register RHS = MI.getOperand(2).getReg(); 1891 1892 uint16_t Flags = MI.getFlags(); 1893 1894 LLT ResTy = MRI.getType(Res); 1895 LLT S32 = LLT::scalar(32); 1896 LLT S64 = LLT::scalar(64); 1897 1898 const MachineFunction &MF = B.getMF(); 1899 bool Unsafe = 1900 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 1901 1902 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 1903 return false; 1904 1905 if (!Unsafe && ResTy == S32 && 1906 MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals) 1907 return false; 1908 1909 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 1910 // 1 / x -> RCP(x) 1911 if (CLHS->isExactlyValue(1.0)) { 1912 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 1913 .addUse(RHS) 1914 .setMIFlags(Flags); 1915 1916 MI.eraseFromParent(); 1917 return true; 1918 } 1919 1920 // -1 / x -> RCP( FNEG(x) ) 1921 if (CLHS->isExactlyValue(-1.0)) { 1922 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 1923 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 1924 .addUse(FNeg.getReg(0)) 1925 .setMIFlags(Flags); 1926 1927 MI.eraseFromParent(); 1928 return true; 1929 } 1930 } 1931 1932 // x / y -> x * (1.0 / y) 1933 if (Unsafe) { 1934 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 1935 .addUse(RHS) 1936 .setMIFlags(Flags); 1937 B.buildFMul(Res, LHS, RCP, Flags); 1938 1939 MI.eraseFromParent(); 1940 return true; 1941 } 1942 1943 return false; 1944 } 1945 1946 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 1947 MachineRegisterInfo &MRI, 1948 
MachineIRBuilder &B) const { 1949 B.setInstr(MI); 1950 Register Res = MI.getOperand(0).getReg(); 1951 Register LHS = MI.getOperand(1).getReg(); 1952 Register RHS = MI.getOperand(2).getReg(); 1953 1954 uint16_t Flags = MI.getFlags(); 1955 1956 LLT S16 = LLT::scalar(16); 1957 LLT S32 = LLT::scalar(32); 1958 1959 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 1960 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 1961 1962 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 1963 .addUse(RHSExt.getReg(0)) 1964 .setMIFlags(Flags); 1965 1966 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 1967 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 1968 1969 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 1970 .addUse(RDst.getReg(0)) 1971 .addUse(RHS) 1972 .addUse(LHS) 1973 .setMIFlags(Flags); 1974 1975 MI.eraseFromParent(); 1976 return true; 1977 } 1978 1979 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 1980 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 1981 static void toggleSPDenormMode(bool Enable, 1982 MachineIRBuilder &B, 1983 const GCNSubtarget &ST, 1984 AMDGPU::SIModeRegisterDefaults Mode) { 1985 // Set SP denorm mode to this value. 1986 unsigned SPDenormMode = 1987 Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; 1988 1989 if (ST.hasDenormModeInst()) { 1990 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 1991 unsigned DPDenormModeDefault = Mode.FP64FP16Denormals 1992 ? FP_DENORM_FLUSH_NONE 1993 : FP_DENORM_FLUSH_IN_FLUSH_OUT; 1994 1995 unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 1996 B.buildInstr(AMDGPU::S_DENORM_MODE) 1997 .addImm(NewDenormModeValue); 1998 1999 } else { 2000 // Select FP32 bit field in mode register. 2001 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2002 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2003 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2004 2005 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2006 .addImm(SPDenormMode) 2007 .addImm(SPDenormModeBitField); 2008 } 2009 } 2010 2011 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2012 MachineRegisterInfo &MRI, 2013 MachineIRBuilder &B) const { 2014 B.setInstr(MI); 2015 Register Res = MI.getOperand(0).getReg(); 2016 Register LHS = MI.getOperand(1).getReg(); 2017 Register RHS = MI.getOperand(2).getReg(); 2018 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2019 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2020 2021 uint16_t Flags = MI.getFlags(); 2022 2023 LLT S32 = LLT::scalar(32); 2024 LLT S1 = LLT::scalar(1); 2025 2026 auto One = B.buildFConstant(S32, 1.0f); 2027 2028 auto DenominatorScaled = 2029 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2030 .addUse(RHS) 2031 .addUse(RHS) 2032 .addUse(LHS) 2033 .setMIFlags(Flags); 2034 auto NumeratorScaled = 2035 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2036 .addUse(LHS) 2037 .addUse(RHS) 2038 .addUse(LHS) 2039 .setMIFlags(Flags); 2040 2041 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2042 .addUse(DenominatorScaled.getReg(0)) 2043 .setMIFlags(Flags); 2044 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2045 2046 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2047 // aren't modeled as reading it. 
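  // The sequence below follows the usual div_scale/rcp refinement scheme,
  // roughly (with a = NumeratorScaled, b = DenominatorScaled, r = ApproxRcp):
  //
  //   Fma0 = 1 - b * r          // error of the initial reciprocal estimate
  //   Fma1 = r + r * Fma0       // refined 1/b
  //   Mul  = a * Fma1           // first quotient estimate
  //   Fma2 = a - b * Mul        // remainder of that estimate
  //   Fma3 = Mul + Fma1 * Fma2  // refined quotient
  //   Fma4 = a - b * Fma3       // final remainder, fed to div_fmas
  //
  // div_fmas then folds in the scale decision from div_scale, and div_fixup
  // patches up the special cases (zeros, infinities, NaNs).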
2048   if (!Mode.FP32Denormals)
2049     toggleSPDenormMode(true, B, ST, Mode);
2050
2051   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2052   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2053   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2054   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2055   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2056   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2057
2058   if (!Mode.FP32Denormals)
2059     toggleSPDenormMode(false, B, ST, Mode);
2060
2061   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2062     .addUse(Fma4.getReg(0))
2063     .addUse(Fma1.getReg(0))
2064     .addUse(Fma3.getReg(0))
2065     .addUse(NumeratorScaled.getReg(1))
2066     .setMIFlags(Flags);
2067
2068   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2069     .addUse(Fmas.getReg(0))
2070     .addUse(RHS)
2071     .addUse(LHS)
2072     .setMIFlags(Flags);
2073
2074   MI.eraseFromParent();
2075   return true;
2076 }
2077
2078 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2079                                          MachineRegisterInfo &MRI,
2080                                          MachineIRBuilder &B) const {
2081   B.setInstr(MI);
2082   Register Res = MI.getOperand(0).getReg();
2083   Register LHS = MI.getOperand(1).getReg();
2084   Register RHS = MI.getOperand(2).getReg();
2085
2086   uint16_t Flags = MI.getFlags();
2087
2088   LLT S64 = LLT::scalar(64);
2089   LLT S1 = LLT::scalar(1);
2090
2091   auto One = B.buildFConstant(S64, 1.0);
2092
2093   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2094     .addUse(RHS)
2095     .addUse(RHS)
2096     .addUse(LHS)
2097     .setMIFlags(Flags);
2098
2099   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2100
2101   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2102     .addUse(DivScale0.getReg(0))
2103     .setMIFlags(Flags);
2104
2105   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2106   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2107   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2108
2109   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2110     .addUse(LHS)
2111     .addUse(RHS)
2112     .addUse(LHS)
2113     .setMIFlags(Flags);
2114
2115   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2116   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2117   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2118
2119   Register Scale;
2120   if (!ST.hasUsableDivScaleConditionOutput()) {
2121     // Workaround a hardware bug on SI where the condition output from div_scale
2122     // is not usable.
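    // Sketch of the workaround below: rederive the scale condition from the
    // values themselves by comparing the high 32 bits of the numerator and
    // denominator against the high 32 bits of the corresponding div_scale
    // results (a changed exponent word means that operand really was rescaled),
    // then XOR the two comparisons to recover the flag div_fmas expects. This
    // mirrors the DAG lowering's workaround for the same bug.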
2123
2124     Scale = MRI.createGenericVirtualRegister(S1);
2125
2126     LLT S32 = LLT::scalar(32);
2127
2128     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2129     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2130     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2131     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2132
2133     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2134                               Scale1Unmerge.getReg(1));
2135     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2136                               Scale0Unmerge.getReg(1));
2137     B.buildXor(Scale, CmpNum, CmpDen);
2138   } else {
2139     Scale = DivScale1.getReg(1);
2140   }
2141
2142   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2143     .addUse(Fma4.getReg(0))
2144     .addUse(Fma3.getReg(0))
2145     .addUse(Mul.getReg(0))
2146     .addUse(Scale)
2147     .setMIFlags(Flags);
2148
2149   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2150     .addUse(Fmas.getReg(0))
2151     .addUse(RHS)
2152     .addUse(LHS)
2153     .setMIFlags(Flags);
2154
2155
2156   MI.eraseFromParent();
2157   return true;
2158 }
2159
2160 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2161                                                  MachineRegisterInfo &MRI,
2162                                                  MachineIRBuilder &B) const {
2163   B.setInstr(MI);
2164   Register Res = MI.getOperand(0).getReg();
2165   Register LHS = MI.getOperand(2).getReg();
2166   Register RHS = MI.getOperand(3).getReg();
2167   uint16_t Flags = MI.getFlags();
2168
2169   LLT S32 = LLT::scalar(32);
2170   LLT S1 = LLT::scalar(1);
2171
2172   auto Abs = B.buildFAbs(S32, RHS, Flags);
2173   const APFloat C0Val(1.0f);
2174
2175   auto C0 = B.buildConstant(S32, 0x6f800000);
2176   auto C1 = B.buildConstant(S32, 0x2f800000);
2177   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2178
2179   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2180   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2181
2182   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2183
2184   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2185     .addUse(Mul0.getReg(0))
2186     .setMIFlags(Flags);
2187
2188   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2189
2190   B.buildFMul(Res, Sel, Mul1, Flags);
2191
2192   MI.eraseFromParent();
2193   return true;
2194 }
2195
2196 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2197                                                  MachineRegisterInfo &MRI,
2198                                                  MachineIRBuilder &B) const {
2199   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2200   if (!MFI->isEntryFunction()) {
2201     return legalizePreloadedArgIntrin(MI, MRI, B,
2202                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2203   }
2204
2205   B.setInstr(MI);
2206
2207   uint64_t Offset =
2208     ST.getTargetLowering()->getImplicitParameterOffset(
2209       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2210   Register DstReg = MI.getOperand(0).getReg();
2211   LLT DstTy = MRI.getType(DstReg);
2212   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2213
2214   const ArgDescriptor *Arg;
2215   const TargetRegisterClass *RC;
2216   std::tie(Arg, RC)
2217     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2218   if (!Arg)
2219     return false;
2220
2221   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2222   if (!loadInputValue(KernargPtrReg, B, Arg))
2223     return false;
2224
2225   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2226   MI.eraseFromParent();
2227   return true;
2228 }
2229
2230 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2231                                               MachineRegisterInfo &MRI,
2232                                               MachineIRBuilder &B,
2233                                               unsigned AddrSpace) const {
2234   B.setInstr(MI);
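  // amdgcn.is.shared / amdgcn.is.private are answered by checking whether the
  // high half of the flat pointer matches the aperture base of the queried
  // address space. Illustrative shape of the MIR built below (names invented):
  //
  //   %aperture:_(s32) = <segment aperture for AddrSpace>
  //   %hi:_(s32)       = G_EXTRACT %ptr(p0), 32
  //   %res:_(s1)       = G_ICMP intpred(eq), %hi, %aperture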
2235   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2236   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2237   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2238   MI.eraseFromParent();
2239   return true;
2240 }
2241
2242 /// Handle register layout difference for f16 images for some subtargets.
2243 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2244                                              MachineRegisterInfo &MRI,
2245                                              Register Reg) const {
2246   if (!ST.hasUnpackedD16VMem())
2247     return Reg;
2248
2249   const LLT S16 = LLT::scalar(16);
2250   const LLT S32 = LLT::scalar(32);
2251   LLT StoreVT = MRI.getType(Reg);
2252   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2253
2254   auto Unmerge = B.buildUnmerge(S16, Reg);
2255
2256   SmallVector<Register, 4> WideRegs;
2257   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2258     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2259
2260   int NumElts = StoreVT.getNumElements();
2261
2262   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2263 }
2264
2265 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2266                                                  MachineRegisterInfo &MRI,
2267                                                  MachineIRBuilder &B,
2268                                                  bool IsFormat) const {
2269   // TODO: Reject f16 format on targets where unsupported.
2270   Register VData = MI.getOperand(1).getReg();
2271   LLT Ty = MRI.getType(VData);
2272
2273   B.setInstr(MI);
2274
2275   const LLT S32 = LLT::scalar(32);
2276   const LLT S16 = LLT::scalar(16);
2277
2278   // Fixup illegal register types for i8 stores.
2279   if (Ty == LLT::scalar(8) || Ty == S16) {
2280     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2281     MI.getOperand(1).setReg(AnyExt);
2282     return true;
2283   }
2284
2285   if (Ty.isVector()) {
2286     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2287       if (IsFormat)
2288         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2289       return true;
2290     }
2291
2292     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2293   }
2294
2295   return Ty == S32;
2296 }
2297
2298 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2299                                             MachineRegisterInfo &MRI,
2300                                             MachineIRBuilder &B) const {
2301   // Replace the G_BRCOND that consumes the intrinsic result with the
2302   // exec-manipulating branch pseudos.
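  // Illustrative shape of the rewrite for amdgcn.if (the loop intrinsic is
  // handled the same way with SI_LOOP); register sizes assume wave64 and the
  // MIR is a sketch rather than verifier-exact:
  //
  //   %brcond:_(s1), %mask:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS amdgcn.if, %cond(s1)
  //   G_BRCOND %brcond, %bb.then
  //     -->
  //   %mask:sreg_64 = SI_IF %cond, %bb.then   ; manipulates exec and branches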
2302 switch (MI.getIntrinsicID()) { 2303 case Intrinsic::amdgcn_if: { 2304 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { 2305 const SIRegisterInfo *TRI 2306 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 2307 2308 B.setInstr(*BrCond); 2309 Register Def = MI.getOperand(1).getReg(); 2310 Register Use = MI.getOperand(3).getReg(); 2311 B.buildInstr(AMDGPU::SI_IF) 2312 .addDef(Def) 2313 .addUse(Use) 2314 .addMBB(BrCond->getOperand(1).getMBB()); 2315 2316 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 2317 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 2318 MI.eraseFromParent(); 2319 BrCond->eraseFromParent(); 2320 return true; 2321 } 2322 2323 return false; 2324 } 2325 case Intrinsic::amdgcn_loop: { 2326 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { 2327 const SIRegisterInfo *TRI 2328 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 2329 2330 B.setInstr(*BrCond); 2331 Register Reg = MI.getOperand(2).getReg(); 2332 B.buildInstr(AMDGPU::SI_LOOP) 2333 .addUse(Reg) 2334 .addMBB(BrCond->getOperand(1).getMBB()); 2335 MI.eraseFromParent(); 2336 BrCond->eraseFromParent(); 2337 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 2338 return true; 2339 } 2340 2341 return false; 2342 } 2343 case Intrinsic::amdgcn_kernarg_segment_ptr: 2344 return legalizePreloadedArgIntrin( 2345 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2346 case Intrinsic::amdgcn_implicitarg_ptr: 2347 return legalizeImplicitArgPtr(MI, MRI, B); 2348 case Intrinsic::amdgcn_workitem_id_x: 2349 return legalizePreloadedArgIntrin(MI, MRI, B, 2350 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 2351 case Intrinsic::amdgcn_workitem_id_y: 2352 return legalizePreloadedArgIntrin(MI, MRI, B, 2353 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 2354 case Intrinsic::amdgcn_workitem_id_z: 2355 return legalizePreloadedArgIntrin(MI, MRI, B, 2356 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 2357 case Intrinsic::amdgcn_workgroup_id_x: 2358 return legalizePreloadedArgIntrin(MI, MRI, B, 2359 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 2360 case Intrinsic::amdgcn_workgroup_id_y: 2361 return legalizePreloadedArgIntrin(MI, MRI, B, 2362 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 2363 case Intrinsic::amdgcn_workgroup_id_z: 2364 return legalizePreloadedArgIntrin(MI, MRI, B, 2365 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 2366 case Intrinsic::amdgcn_dispatch_ptr: 2367 return legalizePreloadedArgIntrin(MI, MRI, B, 2368 AMDGPUFunctionArgInfo::DISPATCH_PTR); 2369 case Intrinsic::amdgcn_queue_ptr: 2370 return legalizePreloadedArgIntrin(MI, MRI, B, 2371 AMDGPUFunctionArgInfo::QUEUE_PTR); 2372 case Intrinsic::amdgcn_implicit_buffer_ptr: 2373 return legalizePreloadedArgIntrin( 2374 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 2375 case Intrinsic::amdgcn_dispatch_id: 2376 return legalizePreloadedArgIntrin(MI, MRI, B, 2377 AMDGPUFunctionArgInfo::DISPATCH_ID); 2378 case Intrinsic::amdgcn_fdiv_fast: 2379 return legalizeFDIVFastIntrin(MI, MRI, B); 2380 case Intrinsic::amdgcn_is_shared: 2381 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 2382 case Intrinsic::amdgcn_is_private: 2383 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 2384 case Intrinsic::amdgcn_wavefrontsize: { 2385 B.setInstr(MI); 2386 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 2387 MI.eraseFromParent(); 2388 return true; 2389 } 2390 case Intrinsic::amdgcn_raw_buffer_store: 2391 return legalizeRawBufferStore(MI, MRI, B, false); 2392 case Intrinsic::amdgcn_raw_buffer_store_format: 2393 return 
legalizeRawBufferStore(MI, MRI, B, true); 2394 default: 2395 return true; 2396 } 2397 2398 return true; 2399 } 2400