//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 97 return [=](const LegalityQuery &Query) { 98 const LLT Ty = Query.Types[TypeIdx]; 99 100 const LLT EltTy = Ty.getElementType(); 101 const int Size = Ty.getSizeInBits(); 102 const int EltSize = EltTy.getSizeInBits(); 103 const int NextMul32 = (Size + 31) / 32; 104 105 assert(EltSize < 32); 106 107 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 108 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 109 }; 110 } 111 112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 113 return [=](const LegalityQuery &Query) { 114 const LLT QueryTy = Query.Types[TypeIdx]; 115 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 116 }; 117 } 118 119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 120 return [=](const LegalityQuery &Query) { 121 const LLT QueryTy = Query.Types[TypeIdx]; 122 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 123 }; 124 } 125 126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 127 return [=](const LegalityQuery &Query) { 128 const LLT QueryTy = Query.Types[TypeIdx]; 129 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 130 }; 131 } 132 133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 134 // v2s16. 135 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 136 return [=](const LegalityQuery &Query) { 137 const LLT Ty = Query.Types[TypeIdx]; 138 if (Ty.isVector()) { 139 const int EltSize = Ty.getElementType().getSizeInBits(); 140 return EltSize == 32 || EltSize == 64 || 141 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 142 EltSize == 128 || EltSize == 256; 143 } 144 145 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 146 }; 147 } 148 149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 150 return [=](const LegalityQuery &Query) { 151 return Query.Types[TypeIdx].getElementType() == Type; 152 }; 153 } 154 155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 156 return [=](const LegalityQuery &Query) { 157 const LLT Ty = Query.Types[TypeIdx]; 158 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 159 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 160 }; 161 } 162 163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 164 const GCNTargetMachine &TM) 165 : ST(ST_) { 166 using namespace TargetOpcode; 167 168 auto GetAddrSpacePtr = [&TM](unsigned AS) { 169 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 170 }; 171 172 const LLT S1 = LLT::scalar(1); 173 const LLT S8 = LLT::scalar(8); 174 const LLT S16 = LLT::scalar(16); 175 const LLT S32 = LLT::scalar(32); 176 const LLT S64 = LLT::scalar(64); 177 const LLT S96 = LLT::scalar(96); 178 const LLT S128 = LLT::scalar(128); 179 const LLT S256 = LLT::scalar(256); 180 const LLT S1024 = LLT::scalar(1024); 181 182 const LLT V2S16 = LLT::vector(2, 16); 183 const LLT V4S16 = LLT::vector(4, 16); 184 185 const LLT V2S32 = LLT::vector(2, 32); 186 const LLT V3S32 = LLT::vector(3, 32); 187 const LLT V4S32 = LLT::vector(4, 32); 188 const LLT V5S32 = LLT::vector(5, 32); 189 const LLT V6S32 = LLT::vector(6, 32); 190 const LLT V7S32 = LLT::vector(7, 32); 191 const LLT V8S32 = LLT::vector(8, 32); 192 const LLT V9S32 = LLT::vector(9, 32); 193 const LLT V10S32 = LLT::vector(10, 32); 194 const LLT V11S32 = LLT::vector(11, 32); 195 const LLT V12S32 = LLT::vector(12, 32); 196 const LLT V13S32 = LLT::vector(13, 32); 197 const LLT V14S32 = 
LLT::vector(14, 32); 198 const LLT V15S32 = LLT::vector(15, 32); 199 const LLT V16S32 = LLT::vector(16, 32); 200 const LLT V32S32 = LLT::vector(32, 32); 201 202 const LLT V2S64 = LLT::vector(2, 64); 203 const LLT V3S64 = LLT::vector(3, 64); 204 const LLT V4S64 = LLT::vector(4, 64); 205 const LLT V5S64 = LLT::vector(5, 64); 206 const LLT V6S64 = LLT::vector(6, 64); 207 const LLT V7S64 = LLT::vector(7, 64); 208 const LLT V8S64 = LLT::vector(8, 64); 209 const LLT V16S64 = LLT::vector(16, 64); 210 211 std::initializer_list<LLT> AllS32Vectors = 212 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 213 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 214 std::initializer_list<LLT> AllS64Vectors = 215 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 216 217 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 218 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 219 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 220 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 221 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 222 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 223 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 224 225 const LLT CodePtr = FlatPtr; 226 227 const std::initializer_list<LLT> AddrSpaces64 = { 228 GlobalPtr, ConstantPtr, FlatPtr 229 }; 230 231 const std::initializer_list<LLT> AddrSpaces32 = { 232 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 233 }; 234 235 const std::initializer_list<LLT> FPTypesBase = { 236 S32, S64 237 }; 238 239 const std::initializer_list<LLT> FPTypes16 = { 240 S32, S64, S16 241 }; 242 243 const std::initializer_list<LLT> FPTypesPK16 = { 244 S32, S64, S16, V2S16 245 }; 246 247 const LLT MinLegalScalarShiftTy = ST.has16BitInsts() ? S16 : S32; 248 249 setAction({G_BRCOND, S1}, Legal); // VCC branches 250 setAction({G_BRCOND, S32}, Legal); // SCC branches 251 252 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 253 // elements for v3s16 254 getActionDefinitionsBuilder(G_PHI) 255 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 256 .legalFor(AllS32Vectors) 257 .legalFor(AllS64Vectors) 258 .legalFor(AddrSpaces64) 259 .legalFor(AddrSpaces32) 260 .clampScalar(0, S32, S256) 261 .widenScalarToNextPow2(0, 32) 262 .clampMaxNumElements(0, S32, 16) 263 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 264 .legalIf(isPointer(0)); 265 266 if (ST.has16BitInsts()) { 267 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 268 .legalFor({S32, S16}) 269 .clampScalar(0, S16, S32) 270 .scalarize(0); 271 } else { 272 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 273 .legalFor({S32}) 274 .clampScalar(0, S32, S32) 275 .scalarize(0); 276 } 277 278 // FIXME: Not really legal. Placeholder for custom lowering. 279 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 280 .legalFor({S32, S64}) 281 .clampScalar(0, S32, S64) 282 .widenScalarToNextPow2(0, 32) 283 .scalarize(0); 284 285 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 286 .legalFor({S32}) 287 .clampScalar(0, S32, S32) 288 .scalarize(0); 289 290 // Report legal for any types we can handle anywhere. For the cases only legal 291 // on the SALU, RegBankSelect will be able to re-legalize. 
292 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 293 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 294 .clampScalar(0, S32, S64) 295 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 296 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 297 .widenScalarToNextPow2(0) 298 .scalarize(0); 299 300 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 301 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 302 .legalFor({{S32, S1}, {S32, S32}}) 303 .clampScalar(0, S32, S32) 304 .scalarize(0); // TODO: Implement. 305 306 getActionDefinitionsBuilder({G_SADDO, G_SSUBO}) 307 .lower(); 308 309 getActionDefinitionsBuilder(G_BITCAST) 310 // Don't worry about the size constraint. 311 .legalIf(all(isRegisterType(0), isRegisterType(1))) 312 // FIXME: Testing hack 313 .legalForCartesianProduct({S16, LLT::vector(2, 8), }) 314 .lower(); 315 316 317 getActionDefinitionsBuilder(G_FCONSTANT) 318 .legalFor({S32, S64, S16}) 319 .clampScalar(0, S16, S64); 320 321 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 322 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 323 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 324 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 325 .clampScalarOrElt(0, S32, S1024) 326 .legalIf(isMultiple32(0)) 327 .widenScalarToNextPow2(0, 32) 328 .clampMaxNumElements(0, S32, 16); 329 330 331 // FIXME: i1 operands to intrinsics should always be legal, but other i1 332 // values may not be legal. We need to figure out how to distinguish 333 // between these two scenarios. 334 getActionDefinitionsBuilder(G_CONSTANT) 335 .legalFor({S1, S32, S64, S16, GlobalPtr, 336 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 337 .clampScalar(0, S32, S64) 338 .widenScalarToNextPow2(0) 339 .legalIf(isPointer(0)); 340 341 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 342 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 343 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); 344 345 346 auto &FPOpActions = getActionDefinitionsBuilder( 347 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 348 .legalFor({S32, S64}); 349 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 350 .customFor({S32, S64}); 351 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 352 .customFor({S32, S64}); 353 354 if (ST.has16BitInsts()) { 355 if (ST.hasVOP3PInsts()) 356 FPOpActions.legalFor({S16, V2S16}); 357 else 358 FPOpActions.legalFor({S16}); 359 360 TrigActions.customFor({S16}); 361 FDIVActions.customFor({S16}); 362 } 363 364 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 365 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 366 367 if (ST.hasVOP3PInsts()) { 368 MinNumMaxNum.customFor(FPTypesPK16) 369 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 370 .clampMaxNumElements(0, S16, 2) 371 .clampScalar(0, S16, S64) 372 .scalarize(0); 373 } else if (ST.has16BitInsts()) { 374 MinNumMaxNum.customFor(FPTypes16) 375 .clampScalar(0, S16, S64) 376 .scalarize(0); 377 } else { 378 MinNumMaxNum.customFor(FPTypesBase) 379 .clampScalar(0, S32, S64) 380 .scalarize(0); 381 } 382 383 if (ST.hasVOP3PInsts()) 384 FPOpActions.clampMaxNumElements(0, S16, 2); 385 386 FPOpActions 387 .scalarize(0) 388 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 389 390 TrigActions 391 .scalarize(0) 392 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 393 394 FDIVActions 395 .scalarize(0) 396 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 397 398 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 399 .legalFor(FPTypesPK16) 400 .clampMaxNumElements(0, S16, 2) 401 .scalarize(0) 402 .clampScalar(0, S16, S64); 403 404 // TODO: Implement 405 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); 406 407 if (ST.has16BitInsts()) { 408 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 409 .legalFor({S32, S64, S16}) 410 .scalarize(0) 411 .clampScalar(0, S16, S64); 412 } else { 413 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 414 .legalFor({S32, S64}) 415 .scalarize(0) 416 .clampScalar(0, S32, S64); 417 } 418 419 getActionDefinitionsBuilder(G_FPTRUNC) 420 .legalFor({{S32, S64}, {S16, S32}}) 421 .scalarize(0); 422 423 getActionDefinitionsBuilder(G_FPEXT) 424 .legalFor({{S64, S32}, {S32, S16}}) 425 .lowerFor({{S64, S16}}) // FIXME: Implement 426 .scalarize(0); 427 428 // TODO: Verify V_BFI_B32 is generated from expanded bit ops. 429 getActionDefinitionsBuilder(G_FCOPYSIGN).lower(); 430 431 getActionDefinitionsBuilder(G_FSUB) 432 // Use actual fsub instruction 433 .legalFor({S32}) 434 // Must use fadd + fneg 435 .lowerFor({S64, S16, V2S16}) 436 .scalarize(0) 437 .clampScalar(0, S32, S64); 438 439 // Whether this is legal depends on the floating point mode for the function. 440 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 441 if (ST.hasMadF16()) 442 FMad.customFor({S32, S16}); 443 else 444 FMad.customFor({S32}); 445 FMad.scalarize(0) 446 .lower(); 447 448 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 449 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 450 {S32, S1}, {S64, S1}, {S16, S1}, 451 {S96, S32}, 452 // FIXME: Hack 453 {S64, LLT::scalar(33)}, 454 {S32, S8}, {S32, LLT::scalar(24)}}) 455 .scalarize(0) 456 .clampScalar(0, S32, S64); 457 458 // TODO: Split s1->s64 during regbankselect for VALU. 
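  // Note: the {s64, s64} case is marked custom below and expanded in
  // legalizeITOFP(), which converts the two 32-bit halves of the source
  // separately and recombines them with ldexp + fadd.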
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
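  // Hence the rules below clamp the result (type index 0) to s32 while still
  // allowing an s32 or s64 source (type index 1).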
551 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, 552 G_CTTZ, G_CTTZ_ZERO_UNDEF, 553 G_CTPOP}) 554 .legalFor({{S32, S32}, {S32, S64}}) 555 .clampScalar(0, S32, S32) 556 .clampScalar(1, S32, S64) 557 .scalarize(0) 558 .widenScalarToNextPow2(0, 32) 559 .widenScalarToNextPow2(1, 32); 560 561 // TODO: Expand for > s32 562 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) 563 .legalFor({S32}) 564 .clampScalar(0, S32, S32) 565 .scalarize(0); 566 567 if (ST.has16BitInsts()) { 568 if (ST.hasVOP3PInsts()) { 569 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 570 .legalFor({S32, S16, V2S16}) 571 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 572 .clampMaxNumElements(0, S16, 2) 573 .clampScalar(0, S16, S32) 574 .widenScalarToNextPow2(0) 575 .scalarize(0); 576 } else { 577 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 578 .legalFor({S32, S16}) 579 .widenScalarToNextPow2(0) 580 .clampScalar(0, S16, S32) 581 .scalarize(0); 582 } 583 } else { 584 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 585 .legalFor({S32}) 586 .clampScalar(0, S32, S32) 587 .widenScalarToNextPow2(0) 588 .scalarize(0); 589 } 590 591 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 592 return [=](const LegalityQuery &Query) { 593 return Query.Types[TypeIdx0].getSizeInBits() < 594 Query.Types[TypeIdx1].getSizeInBits(); 595 }; 596 }; 597 598 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 599 return [=](const LegalityQuery &Query) { 600 return Query.Types[TypeIdx0].getSizeInBits() > 601 Query.Types[TypeIdx1].getSizeInBits(); 602 }; 603 }; 604 605 getActionDefinitionsBuilder(G_INTTOPTR) 606 // List the common cases 607 .legalForCartesianProduct(AddrSpaces64, {S64}) 608 .legalForCartesianProduct(AddrSpaces32, {S32}) 609 .scalarize(0) 610 // Accept any address space as long as the size matches 611 .legalIf(sameSize(0, 1)) 612 .widenScalarIf(smallerThan(1, 0), 613 [](const LegalityQuery &Query) { 614 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 615 }) 616 .narrowScalarIf(greaterThan(1, 0), 617 [](const LegalityQuery &Query) { 618 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 619 }); 620 621 getActionDefinitionsBuilder(G_PTRTOINT) 622 // List the common cases 623 .legalForCartesianProduct(AddrSpaces64, {S64}) 624 .legalForCartesianProduct(AddrSpaces32, {S32}) 625 .scalarize(0) 626 // Accept any address space as long as the size matches 627 .legalIf(sameSize(0, 1)) 628 .widenScalarIf(smallerThan(0, 1), 629 [](const LegalityQuery &Query) { 630 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 631 }) 632 .narrowScalarIf( 633 greaterThan(0, 1), 634 [](const LegalityQuery &Query) { 635 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 636 }); 637 638 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 639 .scalarize(0) 640 .custom(); 641 642 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 643 // handle some operations by just promoting the register during 644 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 645 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { 646 switch (AS) { 647 // FIXME: Private element size. 648 case AMDGPUAS::PRIVATE_ADDRESS: 649 return 32; 650 // FIXME: Check subtarget 651 case AMDGPUAS::LOCAL_ADDRESS: 652 return ST.useDS128() ? 128 : 64; 653 654 // Treat constant and global as identical. 
SMRD loads are sometimes usable 655 // for global loads (ideally constant address space should be eliminated) 656 // depending on the context. Legality cannot be context dependent, but 657 // RegBankSelect can split the load as necessary depending on the pointer 658 // register bank/uniformity and if the memory is invariant or not written in 659 // a kernel. 660 case AMDGPUAS::CONSTANT_ADDRESS: 661 case AMDGPUAS::GLOBAL_ADDRESS: 662 return 512; 663 default: 664 return 128; 665 } 666 }; 667 668 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { 669 const LLT DstTy = Query.Types[0]; 670 671 // Split vector extloads. 672 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 673 unsigned Align = Query.MMODescrs[0].AlignInBits; 674 675 if (MemSize < DstTy.getSizeInBits()) 676 MemSize = std::max(MemSize, Align); 677 678 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 679 return true; 680 681 const LLT PtrTy = Query.Types[1]; 682 unsigned AS = PtrTy.getAddressSpace(); 683 if (MemSize > maxSizeForAddrSpace(AS)) 684 return true; 685 686 // Catch weird sized loads that don't evenly divide into the access sizes 687 // TODO: May be able to widen depending on alignment etc. 688 unsigned NumRegs = MemSize / 32; 689 if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) 690 return true; 691 692 if (Align < MemSize) { 693 const SITargetLowering *TLI = ST.getTargetLowering(); 694 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 695 } 696 697 return false; 698 }; 699 700 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 701 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 702 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 703 704 // TODO: Refine based on subtargets which support unaligned access or 128-bit 705 // LDS 706 // TODO: Unsupported flat for SI. 707 708 for (unsigned Op : {G_LOAD, G_STORE}) { 709 const bool IsStore = Op == G_STORE; 710 711 auto &Actions = getActionDefinitionsBuilder(Op); 712 // Whitelist the common cases. 
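    // Each 4-tuple below is roughly (value type, pointer type, memory size in
    // bits, minimum alignment in bits); e.g. {S32, GlobalPtr, 8, GlobalAlign8}
    // is an 8-bit global access extended to / truncated from an s32 register.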
713 // TODO: Pointer loads 714 // TODO: Wide constant loads 715 // TODO: Only CI+ has 3x loads 716 // TODO: Loads to s16 on gfx9 717 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 718 {V2S32, GlobalPtr, 64, GlobalAlign32}, 719 {V3S32, GlobalPtr, 96, GlobalAlign32}, 720 {S96, GlobalPtr, 96, GlobalAlign32}, 721 {V4S32, GlobalPtr, 128, GlobalAlign32}, 722 {S128, GlobalPtr, 128, GlobalAlign32}, 723 {S64, GlobalPtr, 64, GlobalAlign32}, 724 {V2S64, GlobalPtr, 128, GlobalAlign32}, 725 {V2S16, GlobalPtr, 32, GlobalAlign32}, 726 {S32, GlobalPtr, 8, GlobalAlign8}, 727 {S32, GlobalPtr, 16, GlobalAlign16}, 728 729 {S32, LocalPtr, 32, 32}, 730 {S64, LocalPtr, 64, 32}, 731 {V2S32, LocalPtr, 64, 32}, 732 {S32, LocalPtr, 8, 8}, 733 {S32, LocalPtr, 16, 16}, 734 {V2S16, LocalPtr, 32, 32}, 735 736 {S32, PrivatePtr, 32, 32}, 737 {S32, PrivatePtr, 8, 8}, 738 {S32, PrivatePtr, 16, 16}, 739 {V2S16, PrivatePtr, 32, 32}, 740 741 {S32, FlatPtr, 32, GlobalAlign32}, 742 {S32, FlatPtr, 16, GlobalAlign16}, 743 {S32, FlatPtr, 8, GlobalAlign8}, 744 {V2S16, FlatPtr, 32, GlobalAlign32}, 745 746 {S32, ConstantPtr, 32, GlobalAlign32}, 747 {V2S32, ConstantPtr, 64, GlobalAlign32}, 748 {V3S32, ConstantPtr, 96, GlobalAlign32}, 749 {V4S32, ConstantPtr, 128, GlobalAlign32}, 750 {S64, ConstantPtr, 64, GlobalAlign32}, 751 {S128, ConstantPtr, 128, GlobalAlign32}, 752 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 753 Actions 754 .customIf(typeIs(1, Constant32Ptr)) 755 .narrowScalarIf( 756 [=](const LegalityQuery &Query) -> bool { 757 return !Query.Types[0].isVector() && needToSplitLoad(Query); 758 }, 759 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 760 const LLT DstTy = Query.Types[0]; 761 const LLT PtrTy = Query.Types[1]; 762 763 const unsigned DstSize = DstTy.getSizeInBits(); 764 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 765 766 // Split extloads. 767 if (DstSize > MemSize) 768 return std::make_pair(0, LLT::scalar(MemSize)); 769 770 if (DstSize > 32 && (DstSize % 32 != 0)) { 771 // FIXME: Need a way to specify non-extload of larger size if 772 // suitably aligned. 773 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 774 } 775 776 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 777 if (MemSize > MaxSize) 778 return std::make_pair(0, LLT::scalar(MaxSize)); 779 780 unsigned Align = Query.MMODescrs[0].AlignInBits; 781 return std::make_pair(0, LLT::scalar(Align)); 782 }) 783 .fewerElementsIf( 784 [=](const LegalityQuery &Query) -> bool { 785 return Query.Types[0].isVector() && needToSplitLoad(Query); 786 }, 787 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 788 const LLT DstTy = Query.Types[0]; 789 const LLT PtrTy = Query.Types[1]; 790 791 LLT EltTy = DstTy.getElementType(); 792 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 793 794 // Split if it's too large for the address space. 795 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 796 unsigned NumElts = DstTy.getNumElements(); 797 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 798 799 // FIXME: Refine when odd breakdowns handled 800 // The scalars will need to be re-legalized. 801 if (NumPieces == 1 || NumPieces >= NumElts || 802 NumElts % NumPieces != 0) 803 return std::make_pair(0, EltTy); 804 805 return std::make_pair(0, 806 LLT::vector(NumElts / NumPieces, EltTy)); 807 } 808 809 // Need to split because of alignment. 
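          // For example, if a v4s32 load with only 16-bit alignment has to be
          // split, EltSize > Align here and it is broken into v2s32 pieces
          // (EltSize / Align == 2 elements per piece).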
810 unsigned Align = Query.MMODescrs[0].AlignInBits; 811 unsigned EltSize = EltTy.getSizeInBits(); 812 if (EltSize > Align && 813 (EltSize / Align < DstTy.getNumElements())) { 814 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 815 } 816 817 // May need relegalization for the scalars. 818 return std::make_pair(0, EltTy); 819 }) 820 .minScalar(0, S32); 821 822 if (IsStore) 823 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 824 825 // TODO: Need a bitcast lower option? 826 Actions 827 .legalIf([=](const LegalityQuery &Query) { 828 const LLT Ty0 = Query.Types[0]; 829 unsigned Size = Ty0.getSizeInBits(); 830 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 831 unsigned Align = Query.MMODescrs[0].AlignInBits; 832 833 // FIXME: Widening store from alignment not valid. 834 if (MemSize < Size) 835 MemSize = std::max(MemSize, Align); 836 837 // No extending vector loads. 838 if (Size > MemSize && Ty0.isVector()) 839 return false; 840 841 switch (MemSize) { 842 case 8: 843 case 16: 844 return Size == 32; 845 case 32: 846 case 64: 847 case 128: 848 return true; 849 case 96: 850 return ST.hasDwordx3LoadStores(); 851 case 256: 852 case 512: 853 return true; 854 default: 855 return false; 856 } 857 }) 858 .widenScalarToNextPow2(0) 859 // TODO: v3s32->v4s32 with alignment 860 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 861 } 862 863 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 864 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 865 {S32, GlobalPtr, 16, 2 * 8}, 866 {S32, LocalPtr, 8, 8}, 867 {S32, LocalPtr, 16, 16}, 868 {S32, PrivatePtr, 8, 8}, 869 {S32, PrivatePtr, 16, 16}, 870 {S32, ConstantPtr, 8, 8}, 871 {S32, ConstantPtr, 16, 2 * 8}}); 872 if (ST.hasFlatAddressSpace()) { 873 ExtLoads.legalForTypesWithMemDesc( 874 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 875 } 876 877 ExtLoads.clampScalar(0, S32, S32) 878 .widenScalarToNextPow2(0) 879 .unsupportedIfMemSizeNotPow2() 880 .lower(); 881 882 auto &Atomics = getActionDefinitionsBuilder( 883 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 884 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 885 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 886 G_ATOMICRMW_UMIN}) 887 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 888 {S64, GlobalPtr}, {S64, LocalPtr}}); 889 if (ST.hasFlatAddressSpace()) { 890 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 891 } 892 893 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 894 .legalFor({{S32, LocalPtr}}); 895 896 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 897 // demarshalling 898 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 899 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 900 {S32, FlatPtr}, {S64, FlatPtr}}) 901 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 902 {S32, RegionPtr}, {S64, RegionPtr}}); 903 904 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) 905 .lower(); 906 907 // TODO: Pointer types, any 32-bit or 64-bit vector 908 909 // Condition should be s32 for scalar, s1 for vector. 
910 getActionDefinitionsBuilder(G_SELECT) 911 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 912 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 913 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 914 .clampScalar(0, S16, S64) 915 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 916 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 917 .scalarize(1) 918 .clampMaxNumElements(0, S32, 2) 919 .clampMaxNumElements(0, LocalPtr, 2) 920 .clampMaxNumElements(0, PrivatePtr, 2) 921 .scalarize(0) 922 .widenScalarToNextPow2(0) 923 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 924 925 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 926 // be more flexible with the shift amount type. 927 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 928 .legalFor({{S32, S32}, {S64, S32}}); 929 if (ST.has16BitInsts()) { 930 if (ST.hasVOP3PInsts()) { 931 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 932 .clampMaxNumElements(0, S16, 2); 933 } else 934 Shifts.legalFor({{S16, S32}, {S16, S16}}); 935 936 // TODO: Support 16-bit shift amounts 937 Shifts.clampScalar(1, S32, S32); 938 Shifts.clampScalar(0, S16, S64); 939 Shifts.widenScalarToNextPow2(0, 16); 940 } else { 941 // Make sure we legalize the shift amount type first, as the general 942 // expansion for the shifted type will produce much worse code if it hasn't 943 // been truncated already. 944 Shifts.clampScalar(1, S32, S32); 945 Shifts.clampScalar(0, S32, S64); 946 Shifts.widenScalarToNextPow2(0, 32); 947 } 948 Shifts.scalarize(0); 949 950 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 951 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 952 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 953 unsigned IdxTypeIdx = 2; 954 955 getActionDefinitionsBuilder(Op) 956 .customIf([=](const LegalityQuery &Query) { 957 const LLT EltTy = Query.Types[EltTypeIdx]; 958 const LLT VecTy = Query.Types[VecTypeIdx]; 959 const LLT IdxTy = Query.Types[IdxTypeIdx]; 960 return (EltTy.getSizeInBits() == 16 || 961 EltTy.getSizeInBits() % 32 == 0) && 962 VecTy.getSizeInBits() % 32 == 0 && 963 VecTy.getSizeInBits() <= 1024 && 964 IdxTy.getSizeInBits() == 32; 965 }) 966 .clampScalar(EltTypeIdx, S32, S64) 967 .clampScalar(VecTypeIdx, S32, S64) 968 .clampScalar(IdxTypeIdx, S32, S32); 969 } 970 971 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 972 .unsupportedIf([=](const LegalityQuery &Query) { 973 const LLT &EltTy = Query.Types[1].getElementType(); 974 return Query.Types[0] != EltTy; 975 }); 976 977 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 978 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 979 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 980 981 // FIXME: Doesn't handle extract of illegal sizes. 982 getActionDefinitionsBuilder(Op) 983 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 984 // FIXME: Multiples of 16 should not be legal. 
985 .legalIf([=](const LegalityQuery &Query) { 986 const LLT BigTy = Query.Types[BigTyIdx]; 987 const LLT LitTy = Query.Types[LitTyIdx]; 988 return (BigTy.getSizeInBits() % 32 == 0) && 989 (LitTy.getSizeInBits() % 16 == 0); 990 }) 991 .widenScalarIf( 992 [=](const LegalityQuery &Query) { 993 const LLT BigTy = Query.Types[BigTyIdx]; 994 return (BigTy.getScalarSizeInBits() < 16); 995 }, 996 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 997 .widenScalarIf( 998 [=](const LegalityQuery &Query) { 999 const LLT LitTy = Query.Types[LitTyIdx]; 1000 return (LitTy.getScalarSizeInBits() < 16); 1001 }, 1002 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1003 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1004 .widenScalarToNextPow2(BigTyIdx, 32); 1005 1006 } 1007 1008 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1009 .legalForCartesianProduct(AllS32Vectors, {S32}) 1010 .legalForCartesianProduct(AllS64Vectors, {S64}) 1011 .clampNumElements(0, V16S32, V32S32) 1012 .clampNumElements(0, V2S64, V16S64) 1013 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1014 1015 if (ST.hasScalarPackInsts()) 1016 BuildVector.legalFor({V2S16, S32}); 1017 1018 BuildVector 1019 .minScalarSameAs(1, 0) 1020 .legalIf(isRegisterType(0)) 1021 .minScalarOrElt(0, S32); 1022 1023 if (ST.hasScalarPackInsts()) { 1024 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1025 .legalFor({V2S16, S32}) 1026 .lower(); 1027 } else { 1028 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1029 .lower(); 1030 } 1031 1032 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1033 .legalIf(isRegisterType(0)); 1034 1035 // TODO: Don't fully scalarize v2s16 pieces 1036 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1037 1038 // Merge/Unmerge 1039 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1040 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1041 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1042 1043 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1044 const LLT &Ty = Query.Types[TypeIdx]; 1045 if (Ty.isVector()) { 1046 const LLT &EltTy = Ty.getElementType(); 1047 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) 1048 return true; 1049 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1050 return true; 1051 } 1052 return false; 1053 }; 1054 1055 auto &Builder = getActionDefinitionsBuilder(Op) 1056 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1057 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1058 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1059 // valid. 1060 .clampScalar(LitTyIdx, S16, S256) 1061 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1062 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1063 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1064 elementTypeIs(1, S16)), 1065 changeTo(1, V2S16)) 1066 // Break up vectors with weird elements into scalars 1067 .fewerElementsIf( 1068 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, 1069 scalarize(0)) 1070 .fewerElementsIf( 1071 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, 1072 scalarize(1)) 1073 .clampScalar(BigTyIdx, S32, S1024) 1074 .lowerFor({{S16, V2S16}}); 1075 1076 if (Op == G_MERGE_VALUES) { 1077 Builder.widenScalarIf( 1078 // TODO: Use 16-bit shifts if legal for 8-bit values? 
1079 [=](const LegalityQuery &Query) { 1080 const LLT Ty = Query.Types[LitTyIdx]; 1081 return Ty.getSizeInBits() < 32; 1082 }, 1083 changeTo(LitTyIdx, S32)); 1084 } 1085 1086 Builder.widenScalarIf( 1087 [=](const LegalityQuery &Query) { 1088 const LLT Ty = Query.Types[BigTyIdx]; 1089 return !isPowerOf2_32(Ty.getSizeInBits()) && 1090 Ty.getSizeInBits() % 16 != 0; 1091 }, 1092 [=](const LegalityQuery &Query) { 1093 // Pick the next power of 2, or a multiple of 64 over 128. 1094 // Whichever is smaller. 1095 const LLT &Ty = Query.Types[BigTyIdx]; 1096 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1097 if (NewSizeInBits >= 256) { 1098 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1099 if (RoundedTo < NewSizeInBits) 1100 NewSizeInBits = RoundedTo; 1101 } 1102 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1103 }) 1104 .legalIf([=](const LegalityQuery &Query) { 1105 const LLT &BigTy = Query.Types[BigTyIdx]; 1106 const LLT &LitTy = Query.Types[LitTyIdx]; 1107 1108 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1109 return false; 1110 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1111 return false; 1112 1113 return BigTy.getSizeInBits() % 16 == 0 && 1114 LitTy.getSizeInBits() % 16 == 0 && 1115 BigTy.getSizeInBits() <= 1024; 1116 }) 1117 // Any vectors left are the wrong size. Scalarize them. 1118 .scalarize(0) 1119 .scalarize(1); 1120 } 1121 1122 // TODO: Make legal for s32, s64. s64 case needs break down in regbankselect. 1123 getActionDefinitionsBuilder(G_SEXT_INREG) 1124 .clampScalar(0, MinLegalScalarShiftTy, S64) 1125 .lower(); 1126 1127 getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower(); 1128 1129 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1130 .legalFor({S64}); 1131 1132 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1133 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1134 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1135 .unsupported(); 1136 1137 computeTables(); 1138 verify(*ST.getInstrInfo()); 1139 } 1140 1141 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1142 MachineRegisterInfo &MRI, 1143 MachineIRBuilder &B, 1144 GISelChangeObserver &Observer) const { 1145 switch (MI.getOpcode()) { 1146 case TargetOpcode::G_ADDRSPACE_CAST: 1147 return legalizeAddrSpaceCast(MI, MRI, B); 1148 case TargetOpcode::G_FRINT: 1149 return legalizeFrint(MI, MRI, B); 1150 case TargetOpcode::G_FCEIL: 1151 return legalizeFceil(MI, MRI, B); 1152 case TargetOpcode::G_INTRINSIC_TRUNC: 1153 return legalizeIntrinsicTrunc(MI, MRI, B); 1154 case TargetOpcode::G_SITOFP: 1155 return legalizeITOFP(MI, MRI, B, true); 1156 case TargetOpcode::G_UITOFP: 1157 return legalizeITOFP(MI, MRI, B, false); 1158 case TargetOpcode::G_FMINNUM: 1159 case TargetOpcode::G_FMAXNUM: 1160 case TargetOpcode::G_FMINNUM_IEEE: 1161 case TargetOpcode::G_FMAXNUM_IEEE: 1162 return legalizeMinNumMaxNum(MI, MRI, B); 1163 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1164 return legalizeExtractVectorElt(MI, MRI, B); 1165 case TargetOpcode::G_INSERT_VECTOR_ELT: 1166 return legalizeInsertVectorElt(MI, MRI, B); 1167 case TargetOpcode::G_FSIN: 1168 case TargetOpcode::G_FCOS: 1169 return legalizeSinCos(MI, MRI, B); 1170 case TargetOpcode::G_GLOBAL_VALUE: 1171 return legalizeGlobalValue(MI, MRI, B); 1172 case TargetOpcode::G_LOAD: 1173 return legalizeLoad(MI, MRI, B, Observer); 1174 case TargetOpcode::G_FMAD: 1175 return legalizeFMad(MI, MRI, B); 1176 case TargetOpcode::G_FDIV: 1177 return legalizeFDIV(MI, MRI, B); 1178 case 
TargetOpcode::G_ATOMIC_CMPXCHG: 1179 return legalizeAtomicCmpXChg(MI, MRI, B); 1180 default: 1181 return false; 1182 } 1183 1184 llvm_unreachable("expected switch to return"); 1185 } 1186 1187 Register AMDGPULegalizerInfo::getSegmentAperture( 1188 unsigned AS, 1189 MachineRegisterInfo &MRI, 1190 MachineIRBuilder &B) const { 1191 MachineFunction &MF = B.getMF(); 1192 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1193 const LLT S32 = LLT::scalar(32); 1194 1195 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1196 1197 if (ST.hasApertureRegs()) { 1198 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1199 // getreg. 1200 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1201 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1202 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1203 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1204 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1205 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1206 unsigned Encoding = 1207 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1208 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1209 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1210 1211 Register ApertureReg = MRI.createGenericVirtualRegister(S32); 1212 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1213 1214 B.buildInstr(AMDGPU::S_GETREG_B32) 1215 .addDef(GetReg) 1216 .addImm(Encoding); 1217 MRI.setType(GetReg, S32); 1218 1219 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1220 B.buildInstr(TargetOpcode::G_SHL) 1221 .addDef(ApertureReg) 1222 .addUse(GetReg) 1223 .addUse(ShiftAmt.getReg(0)); 1224 1225 return ApertureReg; 1226 } 1227 1228 Register QueuePtr = MRI.createGenericVirtualRegister( 1229 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1230 1231 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1232 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1233 return Register(); 1234 1235 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1236 // private_segment_aperture_base_hi. 1237 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1238 1239 // TODO: can we be smarter about machine pointer info? 1240 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1241 MachineMemOperand *MMO = MF.getMachineMemOperand( 1242 PtrInfo, 1243 MachineMemOperand::MOLoad | 1244 MachineMemOperand::MODereferenceable | 1245 MachineMemOperand::MOInvariant, 1246 4, 1247 MinAlign(64, StructOffset)); 1248 1249 Register LoadResult = MRI.createGenericVirtualRegister(S32); 1250 Register LoadAddr; 1251 1252 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1253 B.buildLoad(LoadResult, LoadAddr, *MMO); 1254 return LoadResult; 1255 } 1256 1257 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1258 MachineInstr &MI, MachineRegisterInfo &MRI, 1259 MachineIRBuilder &B) const { 1260 MachineFunction &MF = B.getMF(); 1261 1262 B.setInstr(MI); 1263 1264 const LLT S32 = LLT::scalar(32); 1265 Register Dst = MI.getOperand(0).getReg(); 1266 Register Src = MI.getOperand(1).getReg(); 1267 1268 LLT DstTy = MRI.getType(Dst); 1269 LLT SrcTy = MRI.getType(Src); 1270 unsigned DestAS = DstTy.getAddressSpace(); 1271 unsigned SrcAS = SrcTy.getAddressSpace(); 1272 1273 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1274 // vector element. 
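  // Vector casts never reach here: the G_ADDRSPACE_CAST rule is scalarize(0)
  // before custom(), so only scalar casts are handed to this hook.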
1275 assert(!DstTy.isVector()); 1276 1277 const AMDGPUTargetMachine &TM 1278 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1279 1280 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1281 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1282 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1283 return true; 1284 } 1285 1286 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1287 // Truncate. 1288 B.buildExtract(Dst, Src, 0); 1289 MI.eraseFromParent(); 1290 return true; 1291 } 1292 1293 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1294 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1295 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1296 1297 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1298 // another. Merge operands are required to be the same type, but creating an 1299 // extra ptrtoint would be kind of pointless. 1300 auto HighAddr = B.buildConstant( 1301 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1302 B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); 1303 MI.eraseFromParent(); 1304 return true; 1305 } 1306 1307 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1308 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1309 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1310 unsigned NullVal = TM.getNullPointerValue(DestAS); 1311 1312 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1313 auto FlatNull = B.buildConstant(SrcTy, 0); 1314 1315 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); 1316 1317 // Extract low 32-bits of the pointer. 1318 B.buildExtract(PtrLo32, Src, 0); 1319 1320 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1321 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); 1322 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1323 1324 MI.eraseFromParent(); 1325 return true; 1326 } 1327 1328 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1329 return false; 1330 1331 if (!ST.hasFlatAddressSpace()) 1332 return false; 1333 1334 auto SegmentNull = 1335 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1336 auto FlatNull = 1337 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1338 1339 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1340 if (!ApertureReg.isValid()) 1341 return false; 1342 1343 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1344 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); 1345 1346 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); 1347 1348 // Coerce the type of the low half of the result so we can use merge_values. 1349 Register SrcAsInt = MRI.createGenericVirtualRegister(S32); 1350 B.buildInstr(TargetOpcode::G_PTRTOINT) 1351 .addDef(SrcAsInt) 1352 .addUse(Src); 1353 1354 // TODO: Should we allow mismatched types but matching sizes in merges to 1355 // avoid the ptrtoint? 
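  // The 64-bit flat pointer is assembled as {low half = segment offset,
  // high half = aperture base}; the null check then selects between this and
  // the flat null value.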
1356 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); 1357 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); 1358 1359 MI.eraseFromParent(); 1360 return true; 1361 } 1362 1363 bool AMDGPULegalizerInfo::legalizeFrint( 1364 MachineInstr &MI, MachineRegisterInfo &MRI, 1365 MachineIRBuilder &B) const { 1366 B.setInstr(MI); 1367 1368 Register Src = MI.getOperand(1).getReg(); 1369 LLT Ty = MRI.getType(Src); 1370 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1371 1372 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1373 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1374 1375 auto C1 = B.buildFConstant(Ty, C1Val); 1376 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1377 1378 // TODO: Should this propagate fast-math-flags? 1379 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1380 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1381 1382 auto C2 = B.buildFConstant(Ty, C2Val); 1383 auto Fabs = B.buildFAbs(Ty, Src); 1384 1385 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1386 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1387 return true; 1388 } 1389 1390 bool AMDGPULegalizerInfo::legalizeFceil( 1391 MachineInstr &MI, MachineRegisterInfo &MRI, 1392 MachineIRBuilder &B) const { 1393 B.setInstr(MI); 1394 1395 const LLT S1 = LLT::scalar(1); 1396 const LLT S64 = LLT::scalar(64); 1397 1398 Register Src = MI.getOperand(1).getReg(); 1399 assert(MRI.getType(Src) == S64); 1400 1401 // result = trunc(src) 1402 // if (src > 0.0 && src != result) 1403 // result += 1.0 1404 1405 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); 1406 1407 const auto Zero = B.buildFConstant(S64, 0.0); 1408 const auto One = B.buildFConstant(S64, 1.0); 1409 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1410 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1411 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1412 auto Add = B.buildSelect(S64, And, One, Zero); 1413 1414 // TODO: Should this propagate fast-math-flags? 1415 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1416 return true; 1417 } 1418 1419 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1420 MachineIRBuilder &B) { 1421 const unsigned FractBits = 52; 1422 const unsigned ExpBits = 11; 1423 LLT S32 = LLT::scalar(32); 1424 1425 auto Const0 = B.buildConstant(S32, FractBits - 32); 1426 auto Const1 = B.buildConstant(S32, ExpBits); 1427 1428 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1429 .addUse(Const0.getReg(0)) 1430 .addUse(Const1.getReg(0)); 1431 1432 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1433 } 1434 1435 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1436 MachineInstr &MI, MachineRegisterInfo &MRI, 1437 MachineIRBuilder &B) const { 1438 B.setInstr(MI); 1439 1440 const LLT S1 = LLT::scalar(1); 1441 const LLT S32 = LLT::scalar(32); 1442 const LLT S64 = LLT::scalar(64); 1443 1444 Register Src = MI.getOperand(1).getReg(); 1445 assert(MRI.getType(Src) == S64); 1446 1447 // TODO: Should this use extract since the low half is unused? 1448 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1449 Register Hi = Unmerge.getReg(1); 1450 1451 // Extract the upper half, since this is where we will find the sign and 1452 // exponent. 1453 auto Exp = extractF64Exponent(Hi, B); 1454 1455 const unsigned FractBits = 52; 1456 1457 // Extract the sign bit. 
1458 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1459 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1460 1461 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1462 1463 const auto Zero32 = B.buildConstant(S32, 0); 1464 1465 // Extend back to 64-bits. 1466 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); 1467 1468 auto Shr = B.buildAShr(S64, FractMask, Exp); 1469 auto Not = B.buildNot(S64, Shr); 1470 auto Tmp0 = B.buildAnd(S64, Src, Not); 1471 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1472 1473 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1474 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1475 1476 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1477 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1478 return true; 1479 } 1480 1481 bool AMDGPULegalizerInfo::legalizeITOFP( 1482 MachineInstr &MI, MachineRegisterInfo &MRI, 1483 MachineIRBuilder &B, bool Signed) const { 1484 B.setInstr(MI); 1485 1486 Register Dst = MI.getOperand(0).getReg(); 1487 Register Src = MI.getOperand(1).getReg(); 1488 1489 const LLT S64 = LLT::scalar(64); 1490 const LLT S32 = LLT::scalar(32); 1491 1492 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1493 1494 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1495 1496 auto CvtHi = Signed ? 1497 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1498 B.buildUITOFP(S64, Unmerge.getReg(1)); 1499 1500 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1501 1502 auto ThirtyTwo = B.buildConstant(S32, 32); 1503 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1504 .addUse(CvtHi.getReg(0)) 1505 .addUse(ThirtyTwo.getReg(0)); 1506 1507 // TODO: Should this propagate fast-math-flags? 1508 B.buildFAdd(Dst, LdExp, CvtLo); 1509 MI.eraseFromParent(); 1510 return true; 1511 } 1512 1513 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1514 MachineInstr &MI, MachineRegisterInfo &MRI, 1515 MachineIRBuilder &B) const { 1516 MachineFunction &MF = B.getMF(); 1517 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1518 1519 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1520 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1521 1522 // With ieee_mode disabled, the instructions have the correct behavior 1523 // already for G_FMINNUM/G_FMAXNUM 1524 if (!MFI->getMode().IEEE) 1525 return !IsIEEEOp; 1526 1527 if (IsIEEEOp) 1528 return true; 1529 1530 MachineIRBuilder HelperBuilder(MI); 1531 GISelObserverWrapper DummyObserver; 1532 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1533 HelperBuilder.setInstr(MI); 1534 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1535 } 1536 1537 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1538 MachineInstr &MI, MachineRegisterInfo &MRI, 1539 MachineIRBuilder &B) const { 1540 // TODO: Should move some of this into LegalizerHelper. 1541 1542 // TODO: Promote dynamic indexing of s16 to s32 1543 // TODO: Dynamic s64 indexing is only legal for SGPR. 1544 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); 1545 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1546 return true; 1547 1548 Register Dst = MI.getOperand(0).getReg(); 1549 Register Vec = MI.getOperand(1).getReg(); 1550 1551 LLT VecTy = MRI.getType(Vec); 1552 LLT EltTy = VecTy.getElementType(); 1553 assert(EltTy == MRI.getType(Dst)); 1554 1555 B.setInstr(MI); 1556 1557 if (IdxVal.getValue() < VecTy.getNumElements()) 1558 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); 1559 else 1560 B.buildUndef(Dst); 1561 1562 MI.eraseFromParent(); 1563 return true; 1564 } 1565 1566 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1567 MachineInstr &MI, MachineRegisterInfo &MRI, 1568 MachineIRBuilder &B) const { 1569 // TODO: Should move some of this into LegalizerHelper. 1570 1571 // TODO: Promote dynamic indexing of s16 to s32 1572 // TODO: Dynamic s64 indexing is only legal for SGPR. 1573 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); 1574 if (!IdxVal) // Dynamic case will be selected to register indexing. 1575 return true; 1576 1577 Register Dst = MI.getOperand(0).getReg(); 1578 Register Vec = MI.getOperand(1).getReg(); 1579 Register Ins = MI.getOperand(2).getReg(); 1580 1581 LLT VecTy = MRI.getType(Vec); 1582 LLT EltTy = VecTy.getElementType(); 1583 assert(EltTy == MRI.getType(Ins)); 1584 1585 B.setInstr(MI); 1586 1587 if (IdxVal.getValue() < VecTy.getNumElements()) 1588 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); 1589 else 1590 B.buildUndef(Dst); 1591 1592 MI.eraseFromParent(); 1593 return true; 1594 } 1595 1596 bool AMDGPULegalizerInfo::legalizeSinCos( 1597 MachineInstr &MI, MachineRegisterInfo &MRI, 1598 MachineIRBuilder &B) const { 1599 B.setInstr(MI); 1600 1601 Register DstReg = MI.getOperand(0).getReg(); 1602 Register SrcReg = MI.getOperand(1).getReg(); 1603 LLT Ty = MRI.getType(DstReg); 1604 unsigned Flags = MI.getFlags(); 1605 1606 Register TrigVal; 1607 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1608 if (ST.hasTrigReducedRange()) { 1609 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1610 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1611 .addUse(MulVal.getReg(0)) 1612 .setMIFlags(Flags).getReg(0); 1613 } else 1614 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1615 1616 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1617 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1618 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1619 .addUse(TrigVal) 1620 .setMIFlags(Flags); 1621 MI.eraseFromParent(); 1622 return true; 1623 } 1624 1625 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1626 Register DstReg, LLT PtrTy, 1627 MachineIRBuilder &B, const GlobalValue *GV, 1628 unsigned Offset, unsigned GAFlags) const { 1629 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1630 // to the following code sequence: 1631 // 1632 // For constant address space: 1633 // s_getpc_b64 s[0:1] 1634 // s_add_u32 s0, s0, $symbol 1635 // s_addc_u32 s1, s1, 0 1636 // 1637 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1638 // a fixup or relocation is emitted to replace $symbol with a literal 1639 // constant, which is a pc-relative offset from the encoding of the $symbol 1640 // operand to the global variable. 
1641 // 1642 // For global address space: 1643 // s_getpc_b64 s[0:1] 1644 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1645 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1646 // 1647 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1648 // fixups or relocations are emitted to replace $symbol@*@lo and 1649 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1650 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1651 // operand to the global variable. 1652 // 1653 // What we want here is an offset from the value returned by s_getpc 1654 // (which is the address of the s_add_u32 instruction) to the global 1655 // variable, but since the encoding of $symbol starts 4 bytes after the start 1656 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1657 // small. This requires us to add 4 to the global variable offset in order to 1658 // compute the correct address. 1659 1660 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1661 1662 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1663 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1664 1665 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1666 .addDef(PCReg); 1667 1668 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1669 if (GAFlags == SIInstrInfo::MO_NONE) 1670 MIB.addImm(0); 1671 else 1672 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1673 1674 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1675 1676 if (PtrTy.getSizeInBits() == 32) 1677 B.buildExtract(DstReg, PCReg, 0); 1678 return true; 1679 } 1680 1681 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1682 MachineInstr &MI, MachineRegisterInfo &MRI, 1683 MachineIRBuilder &B) const { 1684 Register DstReg = MI.getOperand(0).getReg(); 1685 LLT Ty = MRI.getType(DstReg); 1686 unsigned AS = Ty.getAddressSpace(); 1687 1688 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1689 MachineFunction &MF = B.getMF(); 1690 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1691 B.setInstr(MI); 1692 1693 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1694 if (!MFI->isEntryFunction()) { 1695 const Function &Fn = MF.getFunction(); 1696 DiagnosticInfoUnsupported BadLDSDecl( 1697 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1698 Fn.getContext().diagnose(BadLDSDecl); 1699 } 1700 1701 // TODO: We could emit code to handle the initialization somewhere. 
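    // An LDS global without a meaningful initializer is lowered to a plain
    // constant: allocateLDSGlobal() is expected to hand back the variable's
    // offset within the kernel's LDS allocation, and that offset becomes the
    // pointer value.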
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
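  // G_FMAD is only kept legal when denormals are flushed for the result type,
  // since the underlying mad/mac instructions do not preserve denormals;
  // otherwise fall back to the generic fmul+fadd lowering via LegalizerHelper
  // below.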
  if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(SITargetLowering::isFlatGlobalAddrSpace(
           MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::vector(2, ValTy);

  B.setInstr(MI);
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineInstr *&Br) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  if (UseMI.getParent() != MI.getParent() ||
      UseMI.getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR
  MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
  if (Next != MI.getParent()->end()) {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
  }

  return &UseMI;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
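  // The physical argument register is only live into the entry block, so the
  // defining copy below is emitted at the start of the entry block and the
  // builder's insertion point is restored afterwards.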
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT ResTy = MRI.getType(Res);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  const MachineFunction &MF = B.getMF();
  bool Unsafe =
    MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);

  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
    return false;

  if (!Unsafe && ResTy == S32 &&
      MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  if (Unsafe) {
    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
      .addUse(RHS)
      .setMIFlags(Flags);
    B.buildFMul(Res, LHS, RCP, Flags);

    MI.eraseFromParent();
    return true;
  }

  return false;
}

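// f16 division is expanded in f32: extend both operands, form an approximate
// reciprocal and a single multiply in f32, truncate back to f16, and let
// amdgcn.div.fixup patch up the special cases. A rough sketch of the sequence
// built below (operand order as in the code):
//   %lhs32 = fpext %lhs;  %rhs32 = fpext %rhs
//   %rcp   = amdgcn.rcp(%rhs32)
//   %q16   = fptrunc(%lhs32 * %rcp)
//   result = amdgcn.div.fixup(%q16, %rhs, %lhs)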
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(RHSExt.getReg(0))
    .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(RDst.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
                                   ? FP_DENORM_FLUSH_NONE
                                   : FP_DENORM_FLUSH_IN_FLUSH_OUT;

    unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}

bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(RHS)
      .addUse(LHS)
      .addImm(1)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
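  // The Newton-Raphson style refinement below assumes FP32 denormals are
  // available; when the function's default mode flushes them, denormal support
  // is temporarily enabled around the FMA chain and restored afterwards.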
  if (!Mode.FP32Denormals)
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.FP32Denormals)
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.
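    // The condition bit is reconstructed manually: compare the high halves of
    // the original operands against the div_scale results and xor the two
    // compares to recover which operand was scaled.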

    Scale = MRI.createGenericVirtualRegister(S1);

    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    B.buildXor(Scale, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
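  // is.shared / is.private are lowered by comparing the high 32 bits of the
  // flat pointer against the aperture base of the queried address space.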
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
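  // For amdgcn.if/else/loop the branch is rewritten to SI_IF / SI_ELSE /
  // SI_LOOP and the mask values are constrained to the wave mask register
  // class; most of the remaining intrinsics forward to the preloaded-argument
  // helpers above.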
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      // FIXME: Need to adjust branch targets based on unconditional branch.
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);
  default:
    return true;
  }

  return true;
}