//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
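// For illustration: v3s16 is 48 bits, so NextMul32 = (48 + 31) / 32 = 2 and
// NewNumElts = (32 * 2 + 15) / 16 = 4, i.e. this mutation widens v3s16 to
// v4s16 (a full 64 bits).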
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 97 return [=](const LegalityQuery &Query) { 98 const LLT Ty = Query.Types[TypeIdx]; 99 100 const LLT EltTy = Ty.getElementType(); 101 const int Size = Ty.getSizeInBits(); 102 const int EltSize = EltTy.getSizeInBits(); 103 const int NextMul32 = (Size + 31) / 32; 104 105 assert(EltSize < 32); 106 107 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 108 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 109 }; 110 } 111 112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 113 return [=](const LegalityQuery &Query) { 114 const LLT QueryTy = Query.Types[TypeIdx]; 115 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 116 }; 117 } 118 119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 120 return [=](const LegalityQuery &Query) { 121 const LLT QueryTy = Query.Types[TypeIdx]; 122 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 123 }; 124 } 125 126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 127 return [=](const LegalityQuery &Query) { 128 const LLT QueryTy = Query.Types[TypeIdx]; 129 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 130 }; 131 } 132 133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 134 // v2s16. 135 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 136 return [=](const LegalityQuery &Query) { 137 const LLT Ty = Query.Types[TypeIdx]; 138 if (Ty.isVector()) { 139 const int EltSize = Ty.getElementType().getSizeInBits(); 140 return EltSize == 32 || EltSize == 64 || 141 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 142 EltSize == 128 || EltSize == 256; 143 } 144 145 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 146 }; 147 } 148 149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 150 return [=](const LegalityQuery &Query) { 151 return Query.Types[TypeIdx].getElementType() == Type; 152 }; 153 } 154 155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 156 return [=](const LegalityQuery &Query) { 157 const LLT Ty = Query.Types[TypeIdx]; 158 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 159 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 160 }; 161 } 162 163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 164 const GCNTargetMachine &TM) 165 : ST(ST_) { 166 using namespace TargetOpcode; 167 168 auto GetAddrSpacePtr = [&TM](unsigned AS) { 169 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 170 }; 171 172 const LLT S1 = LLT::scalar(1); 173 const LLT S8 = LLT::scalar(8); 174 const LLT S16 = LLT::scalar(16); 175 const LLT S32 = LLT::scalar(32); 176 const LLT S64 = LLT::scalar(64); 177 const LLT S96 = LLT::scalar(96); 178 const LLT S128 = LLT::scalar(128); 179 const LLT S256 = LLT::scalar(256); 180 const LLT S1024 = LLT::scalar(1024); 181 182 const LLT V2S16 = LLT::vector(2, 16); 183 const LLT V4S16 = LLT::vector(4, 16); 184 185 const LLT V2S32 = LLT::vector(2, 32); 186 const LLT V3S32 = LLT::vector(3, 32); 187 const LLT V4S32 = LLT::vector(4, 32); 188 const LLT V5S32 = LLT::vector(5, 32); 189 const LLT V6S32 = LLT::vector(6, 32); 190 const LLT V7S32 = LLT::vector(7, 32); 191 const LLT V8S32 = LLT::vector(8, 32); 192 const LLT V9S32 = LLT::vector(9, 32); 193 const LLT V10S32 = LLT::vector(10, 32); 194 const LLT V11S32 = LLT::vector(11, 32); 195 const LLT V12S32 = LLT::vector(12, 32); 196 const LLT V13S32 = LLT::vector(13, 32); 197 const LLT V14S32 = 
LLT::vector(14, 32); 198 const LLT V15S32 = LLT::vector(15, 32); 199 const LLT V16S32 = LLT::vector(16, 32); 200 const LLT V32S32 = LLT::vector(32, 32); 201 202 const LLT V2S64 = LLT::vector(2, 64); 203 const LLT V3S64 = LLT::vector(3, 64); 204 const LLT V4S64 = LLT::vector(4, 64); 205 const LLT V5S64 = LLT::vector(5, 64); 206 const LLT V6S64 = LLT::vector(6, 64); 207 const LLT V7S64 = LLT::vector(7, 64); 208 const LLT V8S64 = LLT::vector(8, 64); 209 const LLT V16S64 = LLT::vector(16, 64); 210 211 std::initializer_list<LLT> AllS32Vectors = 212 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 213 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 214 std::initializer_list<LLT> AllS64Vectors = 215 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 216 217 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 218 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 219 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 220 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 221 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 222 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 223 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 224 225 const LLT CodePtr = FlatPtr; 226 227 const std::initializer_list<LLT> AddrSpaces64 = { 228 GlobalPtr, ConstantPtr, FlatPtr 229 }; 230 231 const std::initializer_list<LLT> AddrSpaces32 = { 232 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 233 }; 234 235 const std::initializer_list<LLT> FPTypesBase = { 236 S32, S64 237 }; 238 239 const std::initializer_list<LLT> FPTypes16 = { 240 S32, S64, S16 241 }; 242 243 const std::initializer_list<LLT> FPTypesPK16 = { 244 S32, S64, S16, V2S16 245 }; 246 247 setAction({G_BRCOND, S1}, Legal); 248 249 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 250 // elements for v3s16 251 getActionDefinitionsBuilder(G_PHI) 252 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 253 .legalFor(AllS32Vectors) 254 .legalFor(AllS64Vectors) 255 .legalFor(AddrSpaces64) 256 .legalFor(AddrSpaces32) 257 .clampScalar(0, S32, S256) 258 .widenScalarToNextPow2(0, 32) 259 .clampMaxNumElements(0, S32, 16) 260 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 261 .legalIf(isPointer(0)); 262 263 if (ST.has16BitInsts()) { 264 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 265 .legalFor({S32, S16}) 266 .clampScalar(0, S16, S32) 267 .scalarize(0); 268 } else { 269 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 270 .legalFor({S32}) 271 .clampScalar(0, S32, S32) 272 .scalarize(0); 273 } 274 275 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 276 .legalFor({S32}) 277 .clampScalar(0, S32, S32) 278 .scalarize(0); 279 280 // Report legal for any types we can handle anywhere. For the cases only legal 281 // on the SALU, RegBankSelect will be able to re-legalize. 282 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 283 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 284 .clampScalar(0, S32, S64) 285 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 286 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 287 .widenScalarToNextPow2(0) 288 .scalarize(0); 289 290 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 291 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 292 .legalFor({{S32, S1}}) 293 .clampScalar(0, S32, S32) 294 .scalarize(0); // TODO: Implement. 
295 296 getActionDefinitionsBuilder({G_SADDO, G_SSUBO}) 297 .lower(); 298 299 getActionDefinitionsBuilder(G_BITCAST) 300 // Don't worry about the size constraint. 301 .legalIf(all(isRegisterType(0), isRegisterType(1))) 302 // FIXME: Testing hack 303 .legalForCartesianProduct({S16, LLT::vector(2, 8), }); 304 305 getActionDefinitionsBuilder(G_FCONSTANT) 306 .legalFor({S32, S64, S16}) 307 .clampScalar(0, S16, S64); 308 309 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 310 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 311 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 312 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 313 .clampScalarOrElt(0, S32, S1024) 314 .legalIf(isMultiple32(0)) 315 .widenScalarToNextPow2(0, 32) 316 .clampMaxNumElements(0, S32, 16); 317 318 319 // FIXME: i1 operands to intrinsics should always be legal, but other i1 320 // values may not be legal. We need to figure out how to distinguish 321 // between these two scenarios. 322 getActionDefinitionsBuilder(G_CONSTANT) 323 .legalFor({S1, S32, S64, S16, GlobalPtr, 324 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 325 .clampScalar(0, S32, S64) 326 .widenScalarToNextPow2(0) 327 .legalIf(isPointer(0)); 328 329 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 330 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 331 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); 332 333 334 auto &FPOpActions = getActionDefinitionsBuilder( 335 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 336 .legalFor({S32, S64}); 337 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 338 .customFor({S32, S64}); 339 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 340 .customFor({S32, S64}); 341 342 if (ST.has16BitInsts()) { 343 if (ST.hasVOP3PInsts()) 344 FPOpActions.legalFor({S16, V2S16}); 345 else 346 FPOpActions.legalFor({S16}); 347 348 TrigActions.customFor({S16}); 349 FDIVActions.customFor({S16}); 350 } 351 352 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 353 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 354 355 if (ST.hasVOP3PInsts()) { 356 MinNumMaxNum.customFor(FPTypesPK16) 357 .clampMaxNumElements(0, S16, 2) 358 .clampScalar(0, S16, S64) 359 .scalarize(0); 360 } else if (ST.has16BitInsts()) { 361 MinNumMaxNum.customFor(FPTypes16) 362 .clampScalar(0, S16, S64) 363 .scalarize(0); 364 } else { 365 MinNumMaxNum.customFor(FPTypesBase) 366 .clampScalar(0, S32, S64) 367 .scalarize(0); 368 } 369 370 if (ST.hasVOP3PInsts()) 371 FPOpActions.clampMaxNumElements(0, S16, 2); 372 373 FPOpActions 374 .scalarize(0) 375 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 376 377 TrigActions 378 .scalarize(0) 379 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 380 381 FDIVActions 382 .scalarize(0) 383 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 384 385 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 386 .legalFor(FPTypesPK16) 387 .clampMaxNumElements(0, S16, 2) 388 .scalarize(0) 389 .clampScalar(0, S16, S64); 390 391 // TODO: Implement 392 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); 393 394 if (ST.has16BitInsts()) { 395 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 396 .legalFor({S32, S64, S16}) 397 .scalarize(0) 398 .clampScalar(0, S16, S64); 399 } else { 400 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 401 .legalFor({S32, S64}) 402 .scalarize(0) 403 .clampScalar(0, S32, S64); 404 } 405 406 getActionDefinitionsBuilder(G_FPTRUNC) 407 .legalFor({{S32, S64}, {S16, S32}}) 408 .scalarize(0); 409 410 getActionDefinitionsBuilder(G_FPEXT) 411 .legalFor({{S64, S32}, {S32, S16}}) 412 .lowerFor({{S64, S16}}) // FIXME: Implement 413 .scalarize(0); 414 415 // TODO: Verify V_BFI_B32 is generated from expanded bit ops. 416 getActionDefinitionsBuilder(G_FCOPYSIGN).lower(); 417 418 getActionDefinitionsBuilder(G_FSUB) 419 // Use actual fsub instruction 420 .legalFor({S32}) 421 // Must use fadd + fneg 422 .lowerFor({S64, S16, V2S16}) 423 .scalarize(0) 424 .clampScalar(0, S32, S64); 425 426 // Whether this is legal depends on the floating point mode for the function. 427 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 428 if (ST.hasMadF16()) 429 FMad.customFor({S32, S16}); 430 else 431 FMad.customFor({S32}); 432 FMad.scalarize(0) 433 .lower(); 434 435 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 436 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 437 {S32, S1}, {S64, S1}, {S16, S1}, 438 {S96, S32}, 439 // FIXME: Hack 440 {S64, LLT::scalar(33)}, 441 {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}}) 442 .scalarize(0); 443 444 // TODO: Split s1->s64 during regbankselect for VALU. 
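  // Note: the {S64, S64} case is custom lowered (see legalizeITOFP below),
  // which unmerges the 64-bit source into two 32-bit halves and recombines
  // them as, roughly,
  //   fp(x) = ldexp(fp(hi32(x)), 32) + uitofp(lo32(x))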
445 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 446 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}}) 447 .lowerFor({{S32, S64}}) 448 .customFor({{S64, S64}}); 449 if (ST.has16BitInsts()) 450 IToFP.legalFor({{S16, S16}}); 451 IToFP.clampScalar(1, S32, S64) 452 .scalarize(0); 453 454 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 455 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}); 456 if (ST.has16BitInsts()) 457 FPToI.legalFor({{S16, S16}}); 458 else 459 FPToI.minScalar(1, S32); 460 461 FPToI.minScalar(0, S32) 462 .scalarize(0); 463 464 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 465 .legalFor({S32, S64}) 466 .scalarize(0); 467 468 if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 469 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 470 .legalFor({S32, S64}) 471 .clampScalar(0, S32, S64) 472 .scalarize(0); 473 } else { 474 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 475 .legalFor({S32}) 476 .customFor({S64}) 477 .clampScalar(0, S32, S64) 478 .scalarize(0); 479 } 480 481 getActionDefinitionsBuilder(G_GEP) 482 .legalForCartesianProduct(AddrSpaces64, {S64}) 483 .legalForCartesianProduct(AddrSpaces32, {S32}) 484 .scalarize(0); 485 486 getActionDefinitionsBuilder(G_PTR_MASK) 487 .scalarize(0) 488 .alwaysLegal(); 489 490 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 491 492 auto &CmpBuilder = 493 getActionDefinitionsBuilder(G_ICMP) 494 .legalForCartesianProduct( 495 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 496 .legalFor({{S1, S32}, {S1, S64}}); 497 if (ST.has16BitInsts()) { 498 CmpBuilder.legalFor({{S1, S16}}); 499 } 500 501 CmpBuilder 502 .widenScalarToNextPow2(1) 503 .clampScalar(1, S32, S64) 504 .scalarize(0) 505 .legalIf(all(typeIs(0, S1), isPointer(1))); 506 507 getActionDefinitionsBuilder(G_FCMP) 508 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 509 .widenScalarToNextPow2(1) 510 .clampScalar(1, S32, S64) 511 .scalarize(0); 512 513 // FIXME: fexp, flog2, flog10 needs to be custom lowered. 514 getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2, 515 G_FLOG, G_FLOG2, G_FLOG10}) 516 .legalFor({S32}) 517 .scalarize(0); 518 519 // The 64-bit versions produce 32-bit results, but only on the SALU. 
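  // Hence the result (type index 0) is clamped to exactly 32 bits below,
  // while a 64-bit source (type index 1) stays legal and anything wider is
  // clamped back down to 64 bits.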
520 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, 521 G_CTTZ, G_CTTZ_ZERO_UNDEF, 522 G_CTPOP}) 523 .legalFor({{S32, S32}, {S32, S64}}) 524 .clampScalar(0, S32, S32) 525 .clampScalar(1, S32, S64) 526 .scalarize(0) 527 .widenScalarToNextPow2(0, 32) 528 .widenScalarToNextPow2(1, 32); 529 530 // TODO: Expand for > s32 531 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) 532 .legalFor({S32}) 533 .clampScalar(0, S32, S32) 534 .scalarize(0); 535 536 if (ST.has16BitInsts()) { 537 if (ST.hasVOP3PInsts()) { 538 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 539 .legalFor({S32, S16, V2S16}) 540 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 541 .clampMaxNumElements(0, S16, 2) 542 .clampScalar(0, S16, S32) 543 .widenScalarToNextPow2(0) 544 .scalarize(0); 545 } else { 546 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 547 .legalFor({S32, S16}) 548 .widenScalarToNextPow2(0) 549 .clampScalar(0, S16, S32) 550 .scalarize(0); 551 } 552 } else { 553 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 554 .legalFor({S32}) 555 .clampScalar(0, S32, S32) 556 .widenScalarToNextPow2(0) 557 .scalarize(0); 558 } 559 560 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 561 return [=](const LegalityQuery &Query) { 562 return Query.Types[TypeIdx0].getSizeInBits() < 563 Query.Types[TypeIdx1].getSizeInBits(); 564 }; 565 }; 566 567 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 568 return [=](const LegalityQuery &Query) { 569 return Query.Types[TypeIdx0].getSizeInBits() > 570 Query.Types[TypeIdx1].getSizeInBits(); 571 }; 572 }; 573 574 getActionDefinitionsBuilder(G_INTTOPTR) 575 // List the common cases 576 .legalForCartesianProduct(AddrSpaces64, {S64}) 577 .legalForCartesianProduct(AddrSpaces32, {S32}) 578 .scalarize(0) 579 // Accept any address space as long as the size matches 580 .legalIf(sameSize(0, 1)) 581 .widenScalarIf(smallerThan(1, 0), 582 [](const LegalityQuery &Query) { 583 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 584 }) 585 .narrowScalarIf(greaterThan(1, 0), 586 [](const LegalityQuery &Query) { 587 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 588 }); 589 590 getActionDefinitionsBuilder(G_PTRTOINT) 591 // List the common cases 592 .legalForCartesianProduct(AddrSpaces64, {S64}) 593 .legalForCartesianProduct(AddrSpaces32, {S32}) 594 .scalarize(0) 595 // Accept any address space as long as the size matches 596 .legalIf(sameSize(0, 1)) 597 .widenScalarIf(smallerThan(0, 1), 598 [](const LegalityQuery &Query) { 599 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 600 }) 601 .narrowScalarIf( 602 greaterThan(0, 1), 603 [](const LegalityQuery &Query) { 604 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 605 }); 606 607 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 608 .scalarize(0) 609 .custom(); 610 611 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 612 // handle some operations by just promoting the register during 613 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 614 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { 615 switch (AS) { 616 // FIXME: Private element size. 617 case AMDGPUAS::PRIVATE_ADDRESS: 618 return 32; 619 // FIXME: Check subtarget 620 case AMDGPUAS::LOCAL_ADDRESS: 621 return ST.useDS128() ? 128 : 64; 622 623 // Treat constant and global as identical. 
SMRD loads are sometimes usable 624 // for global loads (ideally constant address space should be eliminated) 625 // depending on the context. Legality cannot be context dependent, but 626 // RegBankSelect can split the load as necessary depending on the pointer 627 // register bank/uniformity and if the memory is invariant or not written in 628 // a kernel. 629 case AMDGPUAS::CONSTANT_ADDRESS: 630 case AMDGPUAS::GLOBAL_ADDRESS: 631 return 512; 632 default: 633 return 128; 634 } 635 }; 636 637 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { 638 const LLT DstTy = Query.Types[0]; 639 640 // Split vector extloads. 641 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 642 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 643 return true; 644 645 const LLT PtrTy = Query.Types[1]; 646 unsigned AS = PtrTy.getAddressSpace(); 647 if (MemSize > maxSizeForAddrSpace(AS)) 648 return true; 649 650 // Catch weird sized loads that don't evenly divide into the access sizes 651 // TODO: May be able to widen depending on alignment etc. 652 unsigned NumRegs = MemSize / 32; 653 if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) 654 return true; 655 656 unsigned Align = Query.MMODescrs[0].AlignInBits; 657 if (Align < MemSize) { 658 const SITargetLowering *TLI = ST.getTargetLowering(); 659 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 660 } 661 662 return false; 663 }; 664 665 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 666 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 667 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 668 669 // TODO: Refine based on subtargets which support unaligned access or 128-bit 670 // LDS 671 // TODO: Unsupported flat for SI. 672 673 for (unsigned Op : {G_LOAD, G_STORE}) { 674 const bool IsStore = Op == G_STORE; 675 676 auto &Actions = getActionDefinitionsBuilder(Op); 677 // Whitelist the common cases. 
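    // Note: each entry below is assumed to read as (register type, pointer
    // type, memory size in bits, minimum alignment in bits); see
    // legalForTypesWithMemDesc for the authoritative field order.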
678 // TODO: Pointer loads 679 // TODO: Wide constant loads 680 // TODO: Only CI+ has 3x loads 681 // TODO: Loads to s16 on gfx9 682 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 683 {V2S32, GlobalPtr, 64, GlobalAlign32}, 684 {V3S32, GlobalPtr, 96, GlobalAlign32}, 685 {S96, GlobalPtr, 96, GlobalAlign32}, 686 {V4S32, GlobalPtr, 128, GlobalAlign32}, 687 {S128, GlobalPtr, 128, GlobalAlign32}, 688 {S64, GlobalPtr, 64, GlobalAlign32}, 689 {V2S64, GlobalPtr, 128, GlobalAlign32}, 690 {V2S16, GlobalPtr, 32, GlobalAlign32}, 691 {S32, GlobalPtr, 8, GlobalAlign8}, 692 {S32, GlobalPtr, 16, GlobalAlign16}, 693 694 {S32, LocalPtr, 32, 32}, 695 {S64, LocalPtr, 64, 32}, 696 {V2S32, LocalPtr, 64, 32}, 697 {S32, LocalPtr, 8, 8}, 698 {S32, LocalPtr, 16, 16}, 699 {V2S16, LocalPtr, 32, 32}, 700 701 {S32, PrivatePtr, 32, 32}, 702 {S32, PrivatePtr, 8, 8}, 703 {S32, PrivatePtr, 16, 16}, 704 {V2S16, PrivatePtr, 32, 32}, 705 706 {S32, FlatPtr, 32, GlobalAlign32}, 707 {S32, FlatPtr, 16, GlobalAlign16}, 708 {S32, FlatPtr, 8, GlobalAlign8}, 709 {V2S16, FlatPtr, 32, GlobalAlign32}, 710 711 {S32, ConstantPtr, 32, GlobalAlign32}, 712 {V2S32, ConstantPtr, 64, GlobalAlign32}, 713 {V3S32, ConstantPtr, 96, GlobalAlign32}, 714 {V4S32, ConstantPtr, 128, GlobalAlign32}, 715 {S64, ConstantPtr, 64, GlobalAlign32}, 716 {S128, ConstantPtr, 128, GlobalAlign32}, 717 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 718 Actions 719 .customIf(typeIs(1, Constant32Ptr)) 720 .narrowScalarIf( 721 [=](const LegalityQuery &Query) -> bool { 722 return !Query.Types[0].isVector() && needToSplitLoad(Query); 723 }, 724 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 725 const LLT DstTy = Query.Types[0]; 726 const LLT PtrTy = Query.Types[1]; 727 728 const unsigned DstSize = DstTy.getSizeInBits(); 729 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 730 731 // Split extloads. 732 if (DstSize > MemSize) 733 return std::make_pair(0, LLT::scalar(MemSize)); 734 735 if (DstSize > 32 && (DstSize % 32 != 0)) { 736 // FIXME: Need a way to specify non-extload of larger size if 737 // suitably aligned. 738 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 739 } 740 741 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 742 if (MemSize > MaxSize) 743 return std::make_pair(0, LLT::scalar(MaxSize)); 744 745 unsigned Align = Query.MMODescrs[0].AlignInBits; 746 return std::make_pair(0, LLT::scalar(Align)); 747 }) 748 .fewerElementsIf( 749 [=](const LegalityQuery &Query) -> bool { 750 return Query.Types[0].isVector() && needToSplitLoad(Query); 751 }, 752 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 753 const LLT DstTy = Query.Types[0]; 754 const LLT PtrTy = Query.Types[1]; 755 756 LLT EltTy = DstTy.getElementType(); 757 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 758 759 // Split if it's too large for the address space. 760 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 761 unsigned NumElts = DstTy.getNumElements(); 762 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 763 764 // FIXME: Refine when odd breakdowns handled 765 // The scalars will need to be re-legalized. 766 if (NumPieces == 1 || NumPieces >= NumElts || 767 NumElts % NumPieces != 0) 768 return std::make_pair(0, EltTy); 769 770 return std::make_pair(0, 771 LLT::vector(NumElts / NumPieces, EltTy)); 772 } 773 774 // Need to split because of alignment. 
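          // For example, a v4s32 load with only 16-bit alignment has
          // EltSize / Align == 2, so it is first split into v2s32 pieces
          // here rather than being scalarized outright.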
775 unsigned Align = Query.MMODescrs[0].AlignInBits; 776 unsigned EltSize = EltTy.getSizeInBits(); 777 if (EltSize > Align && 778 (EltSize / Align < DstTy.getNumElements())) { 779 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 780 } 781 782 // May need relegalization for the scalars. 783 return std::make_pair(0, EltTy); 784 }) 785 .minScalar(0, S32); 786 787 if (IsStore) 788 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 789 790 // TODO: Need a bitcast lower option? 791 Actions 792 .legalIf([=](const LegalityQuery &Query) { 793 const LLT Ty0 = Query.Types[0]; 794 unsigned Size = Ty0.getSizeInBits(); 795 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 796 unsigned Align = Query.MMODescrs[0].AlignInBits; 797 798 // No extending vector loads. 799 if (Size > MemSize && Ty0.isVector()) 800 return false; 801 802 // FIXME: Widening store from alignment not valid. 803 if (MemSize < Size) 804 MemSize = std::max(MemSize, Align); 805 806 switch (MemSize) { 807 case 8: 808 case 16: 809 return Size == 32; 810 case 32: 811 case 64: 812 case 128: 813 return true; 814 case 96: 815 return ST.hasDwordx3LoadStores(); 816 case 256: 817 case 512: 818 return true; 819 default: 820 return false; 821 } 822 }) 823 .widenScalarToNextPow2(0) 824 // TODO: v3s32->v4s32 with alignment 825 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 826 } 827 828 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 829 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 830 {S32, GlobalPtr, 16, 2 * 8}, 831 {S32, LocalPtr, 8, 8}, 832 {S32, LocalPtr, 16, 16}, 833 {S32, PrivatePtr, 8, 8}, 834 {S32, PrivatePtr, 16, 16}, 835 {S32, ConstantPtr, 8, 8}, 836 {S32, ConstantPtr, 16, 2 * 8}}); 837 if (ST.hasFlatAddressSpace()) { 838 ExtLoads.legalForTypesWithMemDesc( 839 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 840 } 841 842 ExtLoads.clampScalar(0, S32, S32) 843 .widenScalarToNextPow2(0) 844 .unsupportedIfMemSizeNotPow2() 845 .lower(); 846 847 auto &Atomics = getActionDefinitionsBuilder( 848 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 849 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 850 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 851 G_ATOMICRMW_UMIN}) 852 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 853 {S64, GlobalPtr}, {S64, LocalPtr}}); 854 if (ST.hasFlatAddressSpace()) { 855 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 856 } 857 858 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 859 .legalFor({{S32, LocalPtr}}); 860 861 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 862 // demarshalling 863 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 864 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 865 {S32, FlatPtr}, {S64, FlatPtr}}) 866 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 867 {S32, RegionPtr}, {S64, RegionPtr}}); 868 869 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) 870 .lower(); 871 872 // TODO: Pointer types, any 32-bit or 64-bit vector 873 getActionDefinitionsBuilder(G_SELECT) 874 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 875 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 876 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1}) 877 .clampScalar(0, S16, S64) 878 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 879 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 880 .scalarize(1) 881 .clampMaxNumElements(0, S32, 2) 882 .clampMaxNumElements(0, LocalPtr, 2) 883 .clampMaxNumElements(0, PrivatePtr, 2) 884 .scalarize(0) 885 .widenScalarToNextPow2(0) 886 
.legalIf(all(isPointer(0), typeIs(1, S1))); 887 888 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 889 // be more flexible with the shift amount type. 890 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 891 .legalFor({{S32, S32}, {S64, S32}}); 892 if (ST.has16BitInsts()) { 893 if (ST.hasVOP3PInsts()) { 894 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 895 .clampMaxNumElements(0, S16, 2); 896 } else 897 Shifts.legalFor({{S16, S32}, {S16, S16}}); 898 899 Shifts.clampScalar(1, S16, S32); 900 Shifts.clampScalar(0, S16, S64); 901 Shifts.widenScalarToNextPow2(0, 16); 902 } else { 903 // Make sure we legalize the shift amount type first, as the general 904 // expansion for the shifted type will produce much worse code if it hasn't 905 // been truncated already. 906 Shifts.clampScalar(1, S32, S32); 907 Shifts.clampScalar(0, S32, S64); 908 Shifts.widenScalarToNextPow2(0, 32); 909 } 910 Shifts.scalarize(0); 911 912 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 913 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 914 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 915 unsigned IdxTypeIdx = 2; 916 917 getActionDefinitionsBuilder(Op) 918 .customIf([=](const LegalityQuery &Query) { 919 const LLT EltTy = Query.Types[EltTypeIdx]; 920 const LLT VecTy = Query.Types[VecTypeIdx]; 921 const LLT IdxTy = Query.Types[IdxTypeIdx]; 922 return (EltTy.getSizeInBits() == 16 || 923 EltTy.getSizeInBits() % 32 == 0) && 924 VecTy.getSizeInBits() % 32 == 0 && 925 VecTy.getSizeInBits() <= 1024 && 926 IdxTy.getSizeInBits() == 32; 927 }) 928 .clampScalar(EltTypeIdx, S32, S64) 929 .clampScalar(VecTypeIdx, S32, S64) 930 .clampScalar(IdxTypeIdx, S32, S32); 931 } 932 933 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 934 .unsupportedIf([=](const LegalityQuery &Query) { 935 const LLT &EltTy = Query.Types[1].getElementType(); 936 return Query.Types[0] != EltTy; 937 }); 938 939 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 940 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 941 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 942 943 // FIXME: Doesn't handle extract of illegal sizes. 944 getActionDefinitionsBuilder(Op) 945 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 946 // FIXME: Multiples of 16 should not be legal. 
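      // The rule below accepts any big type that is a multiple of 32 bits
      // paired with any piece that is a multiple of 16 bits; smaller scalars
      // are first widened to at least 16 bits by the widenScalarIf rules
      // that follow.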
947 .legalIf([=](const LegalityQuery &Query) { 948 const LLT BigTy = Query.Types[BigTyIdx]; 949 const LLT LitTy = Query.Types[LitTyIdx]; 950 return (BigTy.getSizeInBits() % 32 == 0) && 951 (LitTy.getSizeInBits() % 16 == 0); 952 }) 953 .widenScalarIf( 954 [=](const LegalityQuery &Query) { 955 const LLT BigTy = Query.Types[BigTyIdx]; 956 return (BigTy.getScalarSizeInBits() < 16); 957 }, 958 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 959 .widenScalarIf( 960 [=](const LegalityQuery &Query) { 961 const LLT LitTy = Query.Types[LitTyIdx]; 962 return (LitTy.getScalarSizeInBits() < 16); 963 }, 964 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 965 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 966 .widenScalarToNextPow2(BigTyIdx, 32); 967 968 } 969 970 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 971 .legalForCartesianProduct(AllS32Vectors, {S32}) 972 .legalForCartesianProduct(AllS64Vectors, {S64}) 973 .clampNumElements(0, V16S32, V32S32) 974 .clampNumElements(0, V2S64, V16S64) 975 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 976 977 if (ST.hasScalarPackInsts()) 978 BuildVector.legalFor({V2S16, S32}); 979 980 BuildVector 981 .minScalarSameAs(1, 0) 982 .legalIf(isRegisterType(0)) 983 .minScalarOrElt(0, S32); 984 985 if (ST.hasScalarPackInsts()) { 986 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 987 .legalFor({V2S16, S32}) 988 .lower(); 989 } else { 990 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 991 .lower(); 992 } 993 994 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 995 .legalIf(isRegisterType(0)); 996 997 // TODO: Don't fully scalarize v2s16 pieces 998 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 999 1000 // Merge/Unmerge 1001 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1002 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1003 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1004 1005 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1006 const LLT &Ty = Query.Types[TypeIdx]; 1007 if (Ty.isVector()) { 1008 const LLT &EltTy = Ty.getElementType(); 1009 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) 1010 return true; 1011 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1012 return true; 1013 } 1014 return false; 1015 }; 1016 1017 auto &Builder = getActionDefinitionsBuilder(Op) 1018 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1019 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1020 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1021 // valid. 1022 .clampScalar(LitTyIdx, S16, S256) 1023 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1024 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1025 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1026 elementTypeIs(1, S16)), 1027 changeTo(1, V2S16)) 1028 // Break up vectors with weird elements into scalars 1029 .fewerElementsIf( 1030 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, 1031 scalarize(0)) 1032 .fewerElementsIf( 1033 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, 1034 scalarize(1)) 1035 .clampScalar(BigTyIdx, S32, S1024) 1036 .lowerFor({{S16, V2S16}}); 1037 1038 if (Op == G_MERGE_VALUES) { 1039 Builder.widenScalarIf( 1040 // TODO: Use 16-bit shifts if legal for 8-bit values? 
1041 [=](const LegalityQuery &Query) { 1042 const LLT Ty = Query.Types[LitTyIdx]; 1043 return Ty.getSizeInBits() < 32; 1044 }, 1045 changeTo(LitTyIdx, S32)); 1046 } 1047 1048 Builder.widenScalarIf( 1049 [=](const LegalityQuery &Query) { 1050 const LLT Ty = Query.Types[BigTyIdx]; 1051 return !isPowerOf2_32(Ty.getSizeInBits()) && 1052 Ty.getSizeInBits() % 16 != 0; 1053 }, 1054 [=](const LegalityQuery &Query) { 1055 // Pick the next power of 2, or a multiple of 64 over 128. 1056 // Whichever is smaller. 1057 const LLT &Ty = Query.Types[BigTyIdx]; 1058 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1059 if (NewSizeInBits >= 256) { 1060 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1061 if (RoundedTo < NewSizeInBits) 1062 NewSizeInBits = RoundedTo; 1063 } 1064 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1065 }) 1066 .legalIf([=](const LegalityQuery &Query) { 1067 const LLT &BigTy = Query.Types[BigTyIdx]; 1068 const LLT &LitTy = Query.Types[LitTyIdx]; 1069 1070 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1071 return false; 1072 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1073 return false; 1074 1075 return BigTy.getSizeInBits() % 16 == 0 && 1076 LitTy.getSizeInBits() % 16 == 0 && 1077 BigTy.getSizeInBits() <= 1024; 1078 }) 1079 // Any vectors left are the wrong size. Scalarize them. 1080 .scalarize(0) 1081 .scalarize(1); 1082 } 1083 1084 getActionDefinitionsBuilder(G_SEXT_INREG).lower(); 1085 1086 computeTables(); 1087 verify(*ST.getInstrInfo()); 1088 } 1089 1090 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1091 MachineRegisterInfo &MRI, 1092 MachineIRBuilder &B, 1093 GISelChangeObserver &Observer) const { 1094 switch (MI.getOpcode()) { 1095 case TargetOpcode::G_ADDRSPACE_CAST: 1096 return legalizeAddrSpaceCast(MI, MRI, B); 1097 case TargetOpcode::G_FRINT: 1098 return legalizeFrint(MI, MRI, B); 1099 case TargetOpcode::G_FCEIL: 1100 return legalizeFceil(MI, MRI, B); 1101 case TargetOpcode::G_INTRINSIC_TRUNC: 1102 return legalizeIntrinsicTrunc(MI, MRI, B); 1103 case TargetOpcode::G_SITOFP: 1104 return legalizeITOFP(MI, MRI, B, true); 1105 case TargetOpcode::G_UITOFP: 1106 return legalizeITOFP(MI, MRI, B, false); 1107 case TargetOpcode::G_FMINNUM: 1108 case TargetOpcode::G_FMAXNUM: 1109 case TargetOpcode::G_FMINNUM_IEEE: 1110 case TargetOpcode::G_FMAXNUM_IEEE: 1111 return legalizeMinNumMaxNum(MI, MRI, B); 1112 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1113 return legalizeExtractVectorElt(MI, MRI, B); 1114 case TargetOpcode::G_INSERT_VECTOR_ELT: 1115 return legalizeInsertVectorElt(MI, MRI, B); 1116 case TargetOpcode::G_FSIN: 1117 case TargetOpcode::G_FCOS: 1118 return legalizeSinCos(MI, MRI, B); 1119 case TargetOpcode::G_GLOBAL_VALUE: 1120 return legalizeGlobalValue(MI, MRI, B); 1121 case TargetOpcode::G_LOAD: 1122 return legalizeLoad(MI, MRI, B, Observer); 1123 case TargetOpcode::G_FMAD: 1124 return legalizeFMad(MI, MRI, B); 1125 case TargetOpcode::G_FDIV: 1126 return legalizeFDIV(MI, MRI, B); 1127 case TargetOpcode::G_ATOMIC_CMPXCHG: 1128 return legalizeAtomicCmpXChg(MI, MRI, B); 1129 default: 1130 return false; 1131 } 1132 1133 llvm_unreachable("expected switch to return"); 1134 } 1135 1136 Register AMDGPULegalizerInfo::getSegmentAperture( 1137 unsigned AS, 1138 MachineRegisterInfo &MRI, 1139 MachineIRBuilder &B) const { 1140 MachineFunction &MF = B.getMF(); 1141 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1142 const LLT S32 = LLT::scalar(32); 1143 1144 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == 
AMDGPUAS::PRIVATE_ADDRESS); 1145 1146 if (ST.hasApertureRegs()) { 1147 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1148 // getreg. 1149 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1150 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1151 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1152 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1153 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1154 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1155 unsigned Encoding = 1156 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1157 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1158 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1159 1160 Register ApertureReg = MRI.createGenericVirtualRegister(S32); 1161 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1162 1163 B.buildInstr(AMDGPU::S_GETREG_B32) 1164 .addDef(GetReg) 1165 .addImm(Encoding); 1166 MRI.setType(GetReg, S32); 1167 1168 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1169 B.buildInstr(TargetOpcode::G_SHL) 1170 .addDef(ApertureReg) 1171 .addUse(GetReg) 1172 .addUse(ShiftAmt.getReg(0)); 1173 1174 return ApertureReg; 1175 } 1176 1177 Register QueuePtr = MRI.createGenericVirtualRegister( 1178 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1179 1180 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1181 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1182 return Register(); 1183 1184 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1185 // private_segment_aperture_base_hi. 1186 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1187 1188 // FIXME: Don't use undef 1189 Value *V = UndefValue::get(PointerType::get( 1190 Type::getInt8Ty(MF.getFunction().getContext()), 1191 AMDGPUAS::CONSTANT_ADDRESS)); 1192 1193 MachinePointerInfo PtrInfo(V, StructOffset); 1194 MachineMemOperand *MMO = MF.getMachineMemOperand( 1195 PtrInfo, 1196 MachineMemOperand::MOLoad | 1197 MachineMemOperand::MODereferenceable | 1198 MachineMemOperand::MOInvariant, 1199 4, 1200 MinAlign(64, StructOffset)); 1201 1202 Register LoadResult = MRI.createGenericVirtualRegister(S32); 1203 Register LoadAddr; 1204 1205 B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1206 B.buildLoad(LoadResult, LoadAddr, *MMO); 1207 return LoadResult; 1208 } 1209 1210 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1211 MachineInstr &MI, MachineRegisterInfo &MRI, 1212 MachineIRBuilder &B) const { 1213 MachineFunction &MF = B.getMF(); 1214 1215 B.setInstr(MI); 1216 1217 const LLT S32 = LLT::scalar(32); 1218 Register Dst = MI.getOperand(0).getReg(); 1219 Register Src = MI.getOperand(1).getReg(); 1220 1221 LLT DstTy = MRI.getType(Dst); 1222 LLT SrcTy = MRI.getType(Src); 1223 unsigned DestAS = DstTy.getAddressSpace(); 1224 unsigned SrcAS = SrcTy.getAddressSpace(); 1225 1226 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1227 // vector element. 1228 assert(!DstTy.isVector()); 1229 1230 const AMDGPUTargetMachine &TM 1231 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1232 1233 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1234 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1235 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1236 return true; 1237 } 1238 1239 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1240 // Truncate. 
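    // Only the low 32 bits of the source pointer are kept when casting into
    // the 32-bit constant address space, so a plain extract at bit offset 0
    // is enough.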
1241 B.buildExtract(Dst, Src, 0); 1242 MI.eraseFromParent(); 1243 return true; 1244 } 1245 1246 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1247 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1248 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1249 1250 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1251 // another. Merge operands are required to be the same type, but creating an 1252 // extra ptrtoint would be kind of pointless. 1253 auto HighAddr = B.buildConstant( 1254 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1255 B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); 1256 MI.eraseFromParent(); 1257 return true; 1258 } 1259 1260 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1261 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1262 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1263 unsigned NullVal = TM.getNullPointerValue(DestAS); 1264 1265 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1266 auto FlatNull = B.buildConstant(SrcTy, 0); 1267 1268 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); 1269 1270 // Extract low 32-bits of the pointer. 1271 B.buildExtract(PtrLo32, Src, 0); 1272 1273 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1274 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); 1275 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1276 1277 MI.eraseFromParent(); 1278 return true; 1279 } 1280 1281 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1282 return false; 1283 1284 if (!ST.hasFlatAddressSpace()) 1285 return false; 1286 1287 auto SegmentNull = 1288 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1289 auto FlatNull = 1290 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1291 1292 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1293 if (!ApertureReg.isValid()) 1294 return false; 1295 1296 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1297 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); 1298 1299 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); 1300 1301 // Coerce the type of the low half of the result so we can use merge_values. 1302 Register SrcAsInt = MRI.createGenericVirtualRegister(S32); 1303 B.buildInstr(TargetOpcode::G_PTRTOINT) 1304 .addDef(SrcAsInt) 1305 .addUse(Src); 1306 1307 // TODO: Should we allow mismatched types but matching sizes in merges to 1308 // avoid the ptrtoint? 1309 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); 1310 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); 1311 1312 MI.eraseFromParent(); 1313 return true; 1314 } 1315 1316 bool AMDGPULegalizerInfo::legalizeFrint( 1317 MachineInstr &MI, MachineRegisterInfo &MRI, 1318 MachineIRBuilder &B) const { 1319 B.setInstr(MI); 1320 1321 Register Src = MI.getOperand(1).getReg(); 1322 LLT Ty = MRI.getType(Src); 1323 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1324 1325 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1326 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1327 1328 auto C1 = B.buildFConstant(Ty, C1Val); 1329 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1330 1331 // TODO: Should this propagate fast-math-flags? 
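  // Adding and then subtracting 2^52 (given Src's sign via the copysign
  // above) forces the fractional bits out of the f64 mantissa, leaving Src
  // rounded to the nearest integer; the compare/select at the end passes
  // through values whose magnitude is already too large to hold a fraction.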
1332 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1333 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1334 1335 auto C2 = B.buildFConstant(Ty, C2Val); 1336 auto Fabs = B.buildFAbs(Ty, Src); 1337 1338 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1339 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1340 return true; 1341 } 1342 1343 bool AMDGPULegalizerInfo::legalizeFceil( 1344 MachineInstr &MI, MachineRegisterInfo &MRI, 1345 MachineIRBuilder &B) const { 1346 B.setInstr(MI); 1347 1348 const LLT S1 = LLT::scalar(1); 1349 const LLT S64 = LLT::scalar(64); 1350 1351 Register Src = MI.getOperand(1).getReg(); 1352 assert(MRI.getType(Src) == S64); 1353 1354 // result = trunc(src) 1355 // if (src > 0.0 && src != result) 1356 // result += 1.0 1357 1358 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); 1359 1360 const auto Zero = B.buildFConstant(S64, 0.0); 1361 const auto One = B.buildFConstant(S64, 1.0); 1362 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1363 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1364 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1365 auto Add = B.buildSelect(S64, And, One, Zero); 1366 1367 // TODO: Should this propagate fast-math-flags? 1368 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1369 return true; 1370 } 1371 1372 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1373 MachineIRBuilder &B) { 1374 const unsigned FractBits = 52; 1375 const unsigned ExpBits = 11; 1376 LLT S32 = LLT::scalar(32); 1377 1378 auto Const0 = B.buildConstant(S32, FractBits - 32); 1379 auto Const1 = B.buildConstant(S32, ExpBits); 1380 1381 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1382 .addUse(Const0.getReg(0)) 1383 .addUse(Const1.getReg(0)); 1384 1385 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1386 } 1387 1388 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1389 MachineInstr &MI, MachineRegisterInfo &MRI, 1390 MachineIRBuilder &B) const { 1391 B.setInstr(MI); 1392 1393 const LLT S1 = LLT::scalar(1); 1394 const LLT S32 = LLT::scalar(32); 1395 const LLT S64 = LLT::scalar(64); 1396 1397 Register Src = MI.getOperand(1).getReg(); 1398 assert(MRI.getType(Src) == S64); 1399 1400 // TODO: Should this use extract since the low half is unused? 1401 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1402 Register Hi = Unmerge.getReg(1); 1403 1404 // Extract the upper half, since this is where we will find the sign and 1405 // exponent. 1406 auto Exp = extractF64Exponent(Hi, B); 1407 1408 const unsigned FractBits = 52; 1409 1410 // Extract the sign bit. 1411 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1412 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1413 1414 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1415 1416 const auto Zero32 = B.buildConstant(S32, 0); 1417 1418 // Extend back to 64-bits. 
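  // With the unbiased exponent Exp available, truncation toward zero is just
  // clearing the low 52 - Exp fraction bits of Src. A negative exponent
  // means |Src| < 1.0 so only the sign bit survives, and Exp > 51 means Src
  // is already integral; both cases are handled by the selects below.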
1419 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); 1420 1421 auto Shr = B.buildAShr(S64, FractMask, Exp); 1422 auto Not = B.buildNot(S64, Shr); 1423 auto Tmp0 = B.buildAnd(S64, Src, Not); 1424 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1425 1426 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1427 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1428 1429 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1430 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1431 return true; 1432 } 1433 1434 bool AMDGPULegalizerInfo::legalizeITOFP( 1435 MachineInstr &MI, MachineRegisterInfo &MRI, 1436 MachineIRBuilder &B, bool Signed) const { 1437 B.setInstr(MI); 1438 1439 Register Dst = MI.getOperand(0).getReg(); 1440 Register Src = MI.getOperand(1).getReg(); 1441 1442 const LLT S64 = LLT::scalar(64); 1443 const LLT S32 = LLT::scalar(32); 1444 1445 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1446 1447 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1448 1449 auto CvtHi = Signed ? 1450 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1451 B.buildUITOFP(S64, Unmerge.getReg(1)); 1452 1453 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1454 1455 auto ThirtyTwo = B.buildConstant(S32, 32); 1456 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1457 .addUse(CvtHi.getReg(0)) 1458 .addUse(ThirtyTwo.getReg(0)); 1459 1460 // TODO: Should this propagate fast-math-flags? 1461 B.buildFAdd(Dst, LdExp, CvtLo); 1462 MI.eraseFromParent(); 1463 return true; 1464 } 1465 1466 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1467 MachineInstr &MI, MachineRegisterInfo &MRI, 1468 MachineIRBuilder &B) const { 1469 MachineFunction &MF = B.getMF(); 1470 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1471 1472 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1473 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1474 1475 // With ieee_mode disabled, the instructions have the correct behavior 1476 // already for G_FMINNUM/G_FMAXNUM 1477 if (!MFI->getMode().IEEE) 1478 return !IsIEEEOp; 1479 1480 if (IsIEEEOp) 1481 return true; 1482 1483 MachineIRBuilder HelperBuilder(MI); 1484 GISelObserverWrapper DummyObserver; 1485 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1486 HelperBuilder.setInstr(MI); 1487 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1488 } 1489 1490 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1491 MachineInstr &MI, MachineRegisterInfo &MRI, 1492 MachineIRBuilder &B) const { 1493 // TODO: Should move some of this into LegalizerHelper. 1494 1495 // TODO: Promote dynamic indexing of s16 to s32 1496 // TODO: Dynamic s64 indexing is only legal for SGPR. 1497 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); 1498 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1499 return true; 1500 1501 Register Dst = MI.getOperand(0).getReg(); 1502 Register Vec = MI.getOperand(1).getReg(); 1503 1504 LLT VecTy = MRI.getType(Vec); 1505 LLT EltTy = VecTy.getElementType(); 1506 assert(EltTy == MRI.getType(Dst)); 1507 1508 B.setInstr(MI); 1509 1510 if (IdxVal.getValue() < VecTy.getNumElements()) 1511 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); 1512 else 1513 B.buildUndef(Dst); 1514 1515 MI.eraseFromParent(); 1516 return true; 1517 } 1518 1519 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1520 MachineInstr &MI, MachineRegisterInfo &MRI, 1521 MachineIRBuilder &B) const { 1522 // TODO: Should move some of this into LegalizerHelper. 1523 1524 // TODO: Promote dynamic indexing of s16 to s32 1525 // TODO: Dynamic s64 indexing is only legal for SGPR. 1526 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); 1527 if (!IdxVal) // Dynamic case will be selected to register indexing. 1528 return true; 1529 1530 Register Dst = MI.getOperand(0).getReg(); 1531 Register Vec = MI.getOperand(1).getReg(); 1532 Register Ins = MI.getOperand(2).getReg(); 1533 1534 LLT VecTy = MRI.getType(Vec); 1535 LLT EltTy = VecTy.getElementType(); 1536 assert(EltTy == MRI.getType(Ins)); 1537 1538 B.setInstr(MI); 1539 1540 if (IdxVal.getValue() < VecTy.getNumElements()) 1541 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); 1542 else 1543 B.buildUndef(Dst); 1544 1545 MI.eraseFromParent(); 1546 return true; 1547 } 1548 1549 bool AMDGPULegalizerInfo::legalizeSinCos( 1550 MachineInstr &MI, MachineRegisterInfo &MRI, 1551 MachineIRBuilder &B) const { 1552 B.setInstr(MI); 1553 1554 Register DstReg = MI.getOperand(0).getReg(); 1555 Register SrcReg = MI.getOperand(1).getReg(); 1556 LLT Ty = MRI.getType(DstReg); 1557 unsigned Flags = MI.getFlags(); 1558 1559 Register TrigVal; 1560 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1561 if (ST.hasTrigReducedRange()) { 1562 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1563 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1564 .addUse(MulVal.getReg(0)) 1565 .setMIFlags(Flags).getReg(0); 1566 } else 1567 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1568 1569 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1570 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1571 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1572 .addUse(TrigVal) 1573 .setMIFlags(Flags); 1574 MI.eraseFromParent(); 1575 return true; 1576 } 1577 1578 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1579 Register DstReg, LLT PtrTy, 1580 MachineIRBuilder &B, const GlobalValue *GV, 1581 unsigned Offset, unsigned GAFlags) const { 1582 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1583 // to the following code sequence: 1584 // 1585 // For constant address space: 1586 // s_getpc_b64 s[0:1] 1587 // s_add_u32 s0, s0, $symbol 1588 // s_addc_u32 s1, s1, 0 1589 // 1590 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1591 // a fixup or relocation is emitted to replace $symbol with a literal 1592 // constant, which is a pc-relative offset from the encoding of the $symbol 1593 // operand to the global variable. 
1594 // 1595 // For global address space: 1596 // s_getpc_b64 s[0:1] 1597 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1598 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1599 // 1600 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1601 // fixups or relocations are emitted to replace $symbol@*@lo and 1602 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1603 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1604 // operand to the global variable. 1605 // 1606 // What we want here is an offset from the value returned by s_getpc 1607 // (which is the address of the s_add_u32 instruction) to the global 1608 // variable, but since the encoding of $symbol starts 4 bytes after the start 1609 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1610 // small. This requires us to add 4 to the global variable offset in order to 1611 // compute the correct address. 1612 1613 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1614 1615 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1616 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1617 1618 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1619 .addDef(PCReg); 1620 1621 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1622 if (GAFlags == SIInstrInfo::MO_NONE) 1623 MIB.addImm(0); 1624 else 1625 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1626 1627 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1628 1629 if (PtrTy.getSizeInBits() == 32) 1630 B.buildExtract(DstReg, PCReg, 0); 1631 return true; 1632 } 1633 1634 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1635 MachineInstr &MI, MachineRegisterInfo &MRI, 1636 MachineIRBuilder &B) const { 1637 Register DstReg = MI.getOperand(0).getReg(); 1638 LLT Ty = MRI.getType(DstReg); 1639 unsigned AS = Ty.getAddressSpace(); 1640 1641 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1642 MachineFunction &MF = B.getMF(); 1643 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1644 B.setInstr(MI); 1645 1646 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1647 if (!MFI->isEntryFunction()) { 1648 const Function &Fn = MF.getFunction(); 1649 DiagnosticInfoUnsupported BadLDSDecl( 1650 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1651 Fn.getContext().diagnose(BadLDSDecl); 1652 } 1653 1654 // TODO: We could emit code to handle the initialization somewhere. 
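    // A note on the uninitialized case below: allocateLDSGlobal assigns the
    // variable a fixed byte offset within the function's LDS allocation, so
    // the G_GLOBAL_VALUE simply becomes a constant holding that offset.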
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  // TODO: Always legal with future ftz flag.
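  // G_FMAD is kept legal only when denormals are flushed for the result type;
  // otherwise fall back to LegalizerHelper's generic fmul + fadd expansion.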
  if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
    return true;

  MachineFunction &MF = B.getMF();

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(SITargetLowering::isFlatGlobalAddrSpace(
           MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::vector(2, ValTy);

  B.setInstr(MI);
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
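  // If the live-in virtual register has no def yet, materialize the copy from
  // the physical argument register at the start of the entry block, then
  // restore the original insertion point.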
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);

  return false;
}

bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT ResTy = MRI.getType(Res);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  const MachineFunction &MF = B.getMF();
  bool Unsafe =
    MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);

  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
    return false;

  if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals())
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  if (Unsafe) {
    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
      .addUse(RHS)
      .setMIFlags(Flags);
    B.buildFMul(Res, LHS, RCP, Flags);

    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(RHSExt.getReg(0))
    .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(RDst.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode; when it is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable,
                               const GCNSubtarget &ST,
                               MachineIRBuilder &B) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
    unsigned DPDenormModeDefault = ST.hasFP64Denormals()
                                   ? FP_DENORM_FLUSH_NONE
                                   : FP_DENORM_FLUSH_IN_FLUSH_OUT;

    unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}

bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(RHS)
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
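  // When the subtarget flushes FP32 denormals by default, temporarily enable
  // them around the fma refinement chain below and restore the flushing mode
  // again before div_fmas.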
  if (!ST.hasFP32Denormals())
    toggleSPDenormMode(true, ST, B);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!ST.hasFP32Denormals())
    toggleSPDenormMode(false, ST, B);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where it is unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fix up illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the G_BRCOND use of a control-flow intrinsic with the
  // exec-manipulating branch pseudos.
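  // Only the amdgcn_if/amdgcn_loop cases below rewrite the intrinsic together
  // with its sole G_BRCOND user; most of the remaining intrinsics lower to
  // copies of preloaded argument registers.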
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);
  default:
    return true;
  }

  return true;
}