1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #if defined(_MSC_VER) || defined(__MINGW32__) 15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI 16 // from the Visual C++ cmath / math.h headers: 17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 18 #define _USE_MATH_DEFINES 19 #endif 20 21 #include "AMDGPULegalizerInfo.h" 22 23 #include "AMDGPU.h" 24 #include "AMDGPUGlobalISelUtils.h" 25 #include "AMDGPUTargetMachine.h" 26 #include "SIMachineFunctionInfo.h" 27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 30 #include "llvm/CodeGen/TargetOpcodes.h" 31 #include "llvm/CodeGen/ValueTypes.h" 32 #include "llvm/IR/DerivedTypes.h" 33 #include "llvm/IR/DiagnosticInfo.h" 34 #include "llvm/IR/Type.h" 35 #include "llvm/Support/Debug.h" 36 37 #define DEBUG_TYPE "amdgpu-legalinfo" 38 39 using namespace llvm; 40 using namespace LegalizeActions; 41 using namespace LegalizeMutations; 42 using namespace LegalityPredicates; 43 using namespace MIPatternMatch; 44 45 // Round the number of elements to the next power of two elements 46 static LLT getPow2VectorType(LLT Ty) { 47 unsigned NElts = Ty.getNumElements(); 48 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 49 return Ty.changeNumElements(Pow2NElts); 50 } 51 52 // Round the number of bits to the next power of two bits 53 static LLT getPow2ScalarType(LLT Ty) { 54 unsigned Bits = Ty.getSizeInBits(); 55 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 56 return LLT::scalar(Pow2Bits); 57 } 58 59 static LegalityPredicate isMultiple32(unsigned TypeIdx, 60 unsigned MaxSize = 1024) { 61 return [=](const LegalityQuery &Query) { 62 const LLT Ty = Query.Types[TypeIdx]; 63 const LLT EltTy = Ty.getScalarType(); 64 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; 65 }; 66 } 67 68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { 69 return [=](const LegalityQuery &Query) { 70 return Query.Types[TypeIdx].getSizeInBits() == Size; 71 }; 72 } 73 74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 75 return [=](const LegalityQuery &Query) { 76 const LLT Ty = Query.Types[TypeIdx]; 77 return Ty.isVector() && 78 Ty.getNumElements() % 2 != 0 && 79 Ty.getElementType().getSizeInBits() < 32 && 80 Ty.getSizeInBits() % 32 != 0; 81 }; 82 } 83 84 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 85 return [=](const LegalityQuery &Query) { 86 const LLT Ty = Query.Types[TypeIdx]; 87 const LLT EltTy = Ty.getScalarType(); 88 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 89 }; 90 } 91 92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 93 return [=](const LegalityQuery &Query) { 94 const LLT Ty = Query.Types[TypeIdx]; 95 const LLT EltTy = Ty.getElementType(); 96 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 97 }; 98 } 99 100 
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 101 return [=](const LegalityQuery &Query) { 102 const LLT Ty = Query.Types[TypeIdx]; 103 const LLT EltTy = Ty.getElementType(); 104 unsigned Size = Ty.getSizeInBits(); 105 unsigned Pieces = (Size + 63) / 64; 106 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 107 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 108 }; 109 } 110 111 // Increase the number of vector elements to reach the next multiple of 32-bit 112 // type. 113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 114 return [=](const LegalityQuery &Query) { 115 const LLT Ty = Query.Types[TypeIdx]; 116 117 const LLT EltTy = Ty.getElementType(); 118 const int Size = Ty.getSizeInBits(); 119 const int EltSize = EltTy.getSizeInBits(); 120 const int NextMul32 = (Size + 31) / 32; 121 122 assert(EltSize < 32); 123 124 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 125 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 126 }; 127 } 128 129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 130 return [=](const LegalityQuery &Query) { 131 const LLT QueryTy = Query.Types[TypeIdx]; 132 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 133 }; 134 } 135 136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 137 return [=](const LegalityQuery &Query) { 138 const LLT QueryTy = Query.Types[TypeIdx]; 139 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 140 }; 141 } 142 143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 144 return [=](const LegalityQuery &Query) { 145 const LLT QueryTy = Query.Types[TypeIdx]; 146 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 147 }; 148 } 149 150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 151 // v2s16. 
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 153 return [=](const LegalityQuery &Query) { 154 const LLT Ty = Query.Types[TypeIdx]; 155 if (Ty.isVector()) { 156 const int EltSize = Ty.getElementType().getSizeInBits(); 157 return EltSize == 32 || EltSize == 64 || 158 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 159 EltSize == 128 || EltSize == 256; 160 } 161 162 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 163 }; 164 } 165 166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 167 return [=](const LegalityQuery &Query) { 168 const LLT QueryTy = Query.Types[TypeIdx]; 169 return QueryTy.isVector() && QueryTy.getElementType() == Type; 170 }; 171 } 172 173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 174 return [=](const LegalityQuery &Query) { 175 const LLT Ty = Query.Types[TypeIdx]; 176 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 177 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 178 }; 179 } 180 181 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) { 182 return [=](const LegalityQuery &Query) { 183 return Query.Types[TypeIdx0].getSizeInBits() < 184 Query.Types[TypeIdx1].getSizeInBits(); 185 }; 186 } 187 188 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) { 189 return [=](const LegalityQuery &Query) { 190 return Query.Types[TypeIdx0].getSizeInBits() > 191 Query.Types[TypeIdx1].getSizeInBits(); 192 }; 193 } 194 195 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 196 const GCNTargetMachine &TM) 197 : ST(ST_) { 198 using namespace TargetOpcode; 199 200 auto GetAddrSpacePtr = [&TM](unsigned AS) { 201 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 202 }; 203 204 const LLT S1 = LLT::scalar(1); 205 const LLT S16 = LLT::scalar(16); 206 const LLT S32 = LLT::scalar(32); 207 const LLT S64 = LLT::scalar(64); 208 const LLT S128 = LLT::scalar(128); 209 const LLT S256 = LLT::scalar(256); 210 const LLT S1024 = LLT::scalar(1024); 211 212 const LLT V2S16 = LLT::vector(2, 16); 213 const LLT V4S16 = LLT::vector(4, 16); 214 215 const LLT V2S32 = LLT::vector(2, 32); 216 const LLT V3S32 = LLT::vector(3, 32); 217 const LLT V4S32 = LLT::vector(4, 32); 218 const LLT V5S32 = LLT::vector(5, 32); 219 const LLT V6S32 = LLT::vector(6, 32); 220 const LLT V7S32 = LLT::vector(7, 32); 221 const LLT V8S32 = LLT::vector(8, 32); 222 const LLT V9S32 = LLT::vector(9, 32); 223 const LLT V10S32 = LLT::vector(10, 32); 224 const LLT V11S32 = LLT::vector(11, 32); 225 const LLT V12S32 = LLT::vector(12, 32); 226 const LLT V13S32 = LLT::vector(13, 32); 227 const LLT V14S32 = LLT::vector(14, 32); 228 const LLT V15S32 = LLT::vector(15, 32); 229 const LLT V16S32 = LLT::vector(16, 32); 230 const LLT V32S32 = LLT::vector(32, 32); 231 232 const LLT V2S64 = LLT::vector(2, 64); 233 const LLT V3S64 = LLT::vector(3, 64); 234 const LLT V4S64 = LLT::vector(4, 64); 235 const LLT V5S64 = LLT::vector(5, 64); 236 const LLT V6S64 = LLT::vector(6, 64); 237 const LLT V7S64 = LLT::vector(7, 64); 238 const LLT V8S64 = LLT::vector(8, 64); 239 const LLT V16S64 = LLT::vector(16, 64); 240 241 std::initializer_list<LLT> AllS32Vectors = 242 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 243 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 244 std::initializer_list<LLT> AllS64Vectors = 245 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 246 247 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 248 const LLT ConstantPtr = 
GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 249 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 250 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 251 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 252 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 253 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 254 255 const LLT CodePtr = FlatPtr; 256 257 const std::initializer_list<LLT> AddrSpaces64 = { 258 GlobalPtr, ConstantPtr, FlatPtr 259 }; 260 261 const std::initializer_list<LLT> AddrSpaces32 = { 262 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 263 }; 264 265 const std::initializer_list<LLT> FPTypesBase = { 266 S32, S64 267 }; 268 269 const std::initializer_list<LLT> FPTypes16 = { 270 S32, S64, S16 271 }; 272 273 const std::initializer_list<LLT> FPTypesPK16 = { 274 S32, S64, S16, V2S16 275 }; 276 277 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 278 279 setAction({G_BRCOND, S1}, Legal); // VCC branches 280 setAction({G_BRCOND, S32}, Legal); // SCC branches 281 282 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 283 // elements for v3s16 284 getActionDefinitionsBuilder(G_PHI) 285 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 286 .legalFor(AllS32Vectors) 287 .legalFor(AllS64Vectors) 288 .legalFor(AddrSpaces64) 289 .legalFor(AddrSpaces32) 290 .clampScalar(0, S32, S256) 291 .widenScalarToNextPow2(0, 32) 292 .clampMaxNumElements(0, S32, 16) 293 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 294 .legalIf(isPointer(0)); 295 296 if (ST.has16BitInsts()) { 297 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 298 .legalFor({S32, S16}) 299 .clampScalar(0, S16, S32) 300 .scalarize(0) 301 .widenScalarToNextPow2(0, 32); 302 } else { 303 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 304 .legalFor({S32}) 305 .clampScalar(0, S32, S32) 306 .scalarize(0); 307 } 308 309 // FIXME: Not really legal. Placeholder for custom lowering. 310 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 311 .customFor({S32, S64}) 312 .clampScalar(0, S32, S64) 313 .widenScalarToNextPow2(0, 32) 314 .scalarize(0); 315 316 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 317 .legalFor({S32}) 318 .clampScalar(0, S32, S32) 319 .scalarize(0); 320 321 // Report legal for any types we can handle anywhere. For the cases only legal 322 // on the SALU, RegBankSelect will be able to re-legalize. 323 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 324 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 325 .clampScalar(0, S32, S64) 326 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 327 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 328 .widenScalarToNextPow2(0) 329 .scalarize(0); 330 331 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 332 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 333 .legalFor({{S32, S1}, {S32, S32}}) 334 .minScalar(0, S32) 335 // TODO: .scalarize(0) 336 .lower(); 337 338 getActionDefinitionsBuilder(G_BITCAST) 339 // Don't worry about the size constraint. 
340 .legalIf(all(isRegisterType(0), isRegisterType(1))) 341 .lower(); 342 343 344 getActionDefinitionsBuilder(G_CONSTANT) 345 .legalFor({S1, S32, S64, S16, GlobalPtr, 346 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 347 .clampScalar(0, S32, S64) 348 .widenScalarToNextPow2(0) 349 .legalIf(isPointer(0)); 350 351 getActionDefinitionsBuilder(G_FCONSTANT) 352 .legalFor({S32, S64, S16}) 353 .clampScalar(0, S16, S64); 354 355 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 356 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 357 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 358 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 359 .clampScalarOrElt(0, S32, S1024) 360 .legalIf(isMultiple32(0)) 361 .widenScalarToNextPow2(0, 32) 362 .clampMaxNumElements(0, S32, 16); 363 364 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 365 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 366 .unsupportedFor({PrivatePtr}) 367 .custom(); 368 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 369 370 auto &FPOpActions = getActionDefinitionsBuilder( 371 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 372 .legalFor({S32, S64}); 373 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 374 .customFor({S32, S64}); 375 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 376 .customFor({S32, S64}); 377 378 if (ST.has16BitInsts()) { 379 if (ST.hasVOP3PInsts()) 380 FPOpActions.legalFor({S16, V2S16}); 381 else 382 FPOpActions.legalFor({S16}); 383 384 TrigActions.customFor({S16}); 385 FDIVActions.customFor({S16}); 386 } 387 388 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 389 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 390 391 if (ST.hasVOP3PInsts()) { 392 MinNumMaxNum.customFor(FPTypesPK16) 393 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 394 .clampMaxNumElements(0, S16, 2) 395 .clampScalar(0, S16, S64) 396 .scalarize(0); 397 } else if (ST.has16BitInsts()) { 398 MinNumMaxNum.customFor(FPTypes16) 399 .clampScalar(0, S16, S64) 400 .scalarize(0); 401 } else { 402 MinNumMaxNum.customFor(FPTypesBase) 403 .clampScalar(0, S32, S64) 404 .scalarize(0); 405 } 406 407 if (ST.hasVOP3PInsts()) 408 FPOpActions.clampMaxNumElements(0, S16, 2); 409 410 FPOpActions 411 .scalarize(0) 412 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 413 414 TrigActions 415 .scalarize(0) 416 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 417 418 FDIVActions 419 .scalarize(0) 420 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 421 422 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 423 .legalFor(FPTypesPK16) 424 .clampMaxNumElements(0, S16, 2) 425 .scalarize(0) 426 .clampScalar(0, S16, S64); 427 428 if (ST.has16BitInsts()) { 429 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 430 .legalFor({S32, S64, S16}) 431 .scalarize(0) 432 .clampScalar(0, S16, S64); 433 } else { 434 getActionDefinitionsBuilder(G_FSQRT) 435 .legalFor({S32, S64}) 436 .scalarize(0) 437 .clampScalar(0, S32, S64); 438 439 if (ST.hasFractBug()) { 440 getActionDefinitionsBuilder(G_FFLOOR) 441 .customFor({S64}) 442 .legalFor({S32, S64}) 443 .scalarize(0) 444 .clampScalar(0, S32, S64); 445 } else { 446 getActionDefinitionsBuilder(G_FFLOOR) 447 .legalFor({S32, S64}) 448 .scalarize(0) 449 .clampScalar(0, S32, S64); 450 } 451 } 452 453 getActionDefinitionsBuilder(G_FPTRUNC) 454 .legalFor({{S32, S64}, {S16, S32}}) 455 .scalarize(0) 456 .lower(); 457 458 getActionDefinitionsBuilder(G_FPEXT) 459 .legalFor({{S64, S32}, {S32, S16}}) 460 .lowerFor({{S64, S16}}) // FIXME: Implement 461 .scalarize(0); 462 463 getActionDefinitionsBuilder(G_FSUB) 464 // Use actual fsub instruction 465 .legalFor({S32}) 466 // Must use fadd + fneg 467 .lowerFor({S64, S16, V2S16}) 468 .scalarize(0) 469 .clampScalar(0, S32, S64); 470 471 // Whether this is legal depends on the floating point mode for the function. 472 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 473 if (ST.hasMadF16()) 474 FMad.customFor({S32, S16}); 475 else 476 FMad.customFor({S32}); 477 FMad.scalarize(0) 478 .lower(); 479 480 getActionDefinitionsBuilder(G_TRUNC) 481 .alwaysLegal(); 482 483 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 484 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 485 {S32, S1}, {S64, S1}, {S16, S1}}) 486 .scalarize(0) 487 .clampScalar(0, S32, S64) 488 .widenScalarToNextPow2(1, 32); 489 490 // TODO: Split s1->s64 during regbankselect for VALU. 491 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 492 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 493 .lowerFor({{S32, S64}}) 494 .lowerIf(typeIs(1, S1)) 495 .customFor({{S64, S64}}); 496 if (ST.has16BitInsts()) 497 IToFP.legalFor({{S16, S16}}); 498 IToFP.clampScalar(1, S32, S64) 499 .scalarize(0) 500 .widenScalarToNextPow2(1); 501 502 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 503 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 504 .customFor({{S64, S64}}); 505 if (ST.has16BitInsts()) 506 FPToI.legalFor({{S16, S16}}); 507 else 508 FPToI.minScalar(1, S32); 509 510 FPToI.minScalar(0, S32) 511 .scalarize(0) 512 .lower(); 513 514 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 515 .scalarize(0) 516 .lower(); 517 518 if (ST.has16BitInsts()) { 519 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 520 .legalFor({S16, S32, S64}) 521 .clampScalar(0, S16, S64) 522 .scalarize(0); 523 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 524 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 525 .legalFor({S32, S64}) 526 .clampScalar(0, S32, S64) 527 .scalarize(0); 528 } else { 529 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 530 .legalFor({S32}) 531 .customFor({S64}) 532 .clampScalar(0, S32, S64) 533 .scalarize(0); 534 } 535 536 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 537 .scalarize(0) 538 .alwaysLegal(); 539 540 auto &CmpBuilder = 541 getActionDefinitionsBuilder(G_ICMP) 542 // The compare output type differs based on the register bank of the output, 543 // so make both s1 and s32 legal. 
544 // 545 // Scalar compares producing output in scc will be promoted to s32, as that 546 // is the allocatable register type that will be needed for the copy from 547 // scc. This will be promoted during RegBankSelect, and we assume something 548 // before that won't try to use s32 result types. 549 // 550 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 551 // bank. 552 .legalForCartesianProduct( 553 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 554 .legalForCartesianProduct( 555 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 556 if (ST.has16BitInsts()) { 557 CmpBuilder.legalFor({{S1, S16}}); 558 } 559 560 CmpBuilder 561 .widenScalarToNextPow2(1) 562 .clampScalar(1, S32, S64) 563 .scalarize(0) 564 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 565 566 getActionDefinitionsBuilder(G_FCMP) 567 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 568 .widenScalarToNextPow2(1) 569 .clampScalar(1, S32, S64) 570 .scalarize(0); 571 572 // FIXME: fpow has a selection pattern that should move to custom lowering. 573 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 574 if (ST.has16BitInsts()) 575 Exp2Ops.legalFor({S32, S16}); 576 else 577 Exp2Ops.legalFor({S32}); 578 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 579 Exp2Ops.scalarize(0); 580 581 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 582 if (ST.has16BitInsts()) 583 ExpOps.customFor({{S32}, {S16}}); 584 else 585 ExpOps.customFor({S32}); 586 ExpOps.clampScalar(0, MinScalarFPTy, S32) 587 .scalarize(0); 588 589 // The 64-bit versions produce 32-bit results, but only on the SALU. 590 getActionDefinitionsBuilder(G_CTPOP) 591 .legalFor({{S32, S32}, {S32, S64}}) 592 .clampScalar(0, S32, S32) 593 .clampScalar(1, S32, S64) 594 .scalarize(0) 595 .widenScalarToNextPow2(0, 32) 596 .widenScalarToNextPow2(1, 32); 597 598 // The hardware instructions return a different result on 0 than the generic 599 // instructions expect. The hardware produces -1, but these produce the 600 // bitwidth. 601 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 602 .scalarize(0) 603 .clampScalar(0, S32, S32) 604 .clampScalar(1, S32, S64) 605 .widenScalarToNextPow2(0, 32) 606 .widenScalarToNextPow2(1, 32) 607 .lower(); 608 609 // The 64-bit versions produce 32-bit results, but only on the SALU. 610 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 611 .legalFor({{S32, S32}, {S32, S64}}) 612 .clampScalar(0, S32, S32) 613 .clampScalar(1, S32, S64) 614 .scalarize(0) 615 .widenScalarToNextPow2(0, 32) 616 .widenScalarToNextPow2(1, 32); 617 618 getActionDefinitionsBuilder(G_BITREVERSE) 619 .legalFor({S32}) 620 .clampScalar(0, S32, S32) 621 .scalarize(0); 622 623 if (ST.has16BitInsts()) { 624 getActionDefinitionsBuilder(G_BSWAP) 625 .legalFor({S16, S32, V2S16}) 626 .clampMaxNumElements(0, S16, 2) 627 // FIXME: Fixing non-power-of-2 before clamp is workaround for 628 // narrowScalar limitation. 
629 .widenScalarToNextPow2(0) 630 .clampScalar(0, S16, S32) 631 .scalarize(0); 632 633 if (ST.hasVOP3PInsts()) { 634 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 635 .legalFor({S32, S16, V2S16}) 636 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 637 .clampMaxNumElements(0, S16, 2) 638 .clampScalar(0, S16, S32) 639 .widenScalarToNextPow2(0) 640 .scalarize(0); 641 } else { 642 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 643 .legalFor({S32, S16}) 644 .widenScalarToNextPow2(0) 645 .clampScalar(0, S16, S32) 646 .scalarize(0); 647 } 648 } else { 649 // TODO: Should have same legality without v_perm_b32 650 getActionDefinitionsBuilder(G_BSWAP) 651 .legalFor({S32}) 652 .lowerIf(narrowerThan(0, 32)) 653 // FIXME: Fixing non-power-of-2 before clamp is workaround for 654 // narrowScalar limitation. 655 .widenScalarToNextPow2(0) 656 .maxScalar(0, S32) 657 .scalarize(0) 658 .lower(); 659 660 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 661 .legalFor({S32}) 662 .clampScalar(0, S32, S32) 663 .widenScalarToNextPow2(0) 664 .scalarize(0); 665 } 666 667 getActionDefinitionsBuilder(G_INTTOPTR) 668 // List the common cases 669 .legalForCartesianProduct(AddrSpaces64, {S64}) 670 .legalForCartesianProduct(AddrSpaces32, {S32}) 671 .scalarize(0) 672 // Accept any address space as long as the size matches 673 .legalIf(sameSize(0, 1)) 674 .widenScalarIf(smallerThan(1, 0), 675 [](const LegalityQuery &Query) { 676 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 677 }) 678 .narrowScalarIf(greaterThan(1, 0), 679 [](const LegalityQuery &Query) { 680 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 681 }); 682 683 getActionDefinitionsBuilder(G_PTRTOINT) 684 // List the common cases 685 .legalForCartesianProduct(AddrSpaces64, {S64}) 686 .legalForCartesianProduct(AddrSpaces32, {S32}) 687 .scalarize(0) 688 // Accept any address space as long as the size matches 689 .legalIf(sameSize(0, 1)) 690 .widenScalarIf(smallerThan(0, 1), 691 [](const LegalityQuery &Query) { 692 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 693 }) 694 .narrowScalarIf( 695 greaterThan(0, 1), 696 [](const LegalityQuery &Query) { 697 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 698 }); 699 700 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 701 .scalarize(0) 702 .custom(); 703 704 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 705 // handle some operations by just promoting the register during 706 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 707 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 708 switch (AS) { 709 // FIXME: Private element size. 710 case AMDGPUAS::PRIVATE_ADDRESS: 711 return 32; 712 // FIXME: Check subtarget 713 case AMDGPUAS::LOCAL_ADDRESS: 714 return ST.useDS128() ? 128 : 64; 715 716 // Treat constant and global as identical. SMRD loads are sometimes usable 717 // for global loads (ideally constant address space should be eliminated) 718 // depending on the context. Legality cannot be context dependent, but 719 // RegBankSelect can split the load as necessary depending on the pointer 720 // register bank/uniformity and if the memory is invariant or not written in 721 // a kernel. 722 case AMDGPUAS::CONSTANT_ADDRESS: 723 case AMDGPUAS::GLOBAL_ADDRESS: 724 return IsLoad ? 
512 : 128; 725 default: 726 return 128; 727 } 728 }; 729 730 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 731 bool IsLoad) -> bool { 732 const LLT DstTy = Query.Types[0]; 733 734 // Split vector extloads. 735 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 736 unsigned Align = Query.MMODescrs[0].AlignInBits; 737 738 if (MemSize < DstTy.getSizeInBits()) 739 MemSize = std::max(MemSize, Align); 740 741 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 742 return true; 743 744 const LLT PtrTy = Query.Types[1]; 745 unsigned AS = PtrTy.getAddressSpace(); 746 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 747 return true; 748 749 // Catch weird sized loads that don't evenly divide into the access sizes 750 // TODO: May be able to widen depending on alignment etc. 751 unsigned NumRegs = (MemSize + 31) / 32; 752 if (NumRegs == 3) { 753 if (!ST.hasDwordx3LoadStores()) 754 return true; 755 } else { 756 // If the alignment allows, these should have been widened. 757 if (!isPowerOf2_32(NumRegs)) 758 return true; 759 } 760 761 if (Align < MemSize) { 762 const SITargetLowering *TLI = ST.getTargetLowering(); 763 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 764 } 765 766 return false; 767 }; 768 769 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool { 770 unsigned Size = Query.Types[0].getSizeInBits(); 771 if (isPowerOf2_32(Size)) 772 return false; 773 774 if (Size == 96 && ST.hasDwordx3LoadStores()) 775 return false; 776 777 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 778 if (Size >= maxSizeForAddrSpace(AddrSpace, true)) 779 return false; 780 781 unsigned Align = Query.MMODescrs[0].AlignInBits; 782 unsigned RoundedSize = NextPowerOf2(Size); 783 return (Align >= RoundedSize); 784 }; 785 786 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 787 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 788 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 789 790 // TODO: Refine based on subtargets which support unaligned access or 128-bit 791 // LDS 792 // TODO: Unsupported flat for SI. 793 794 for (unsigned Op : {G_LOAD, G_STORE}) { 795 const bool IsStore = Op == G_STORE; 796 797 auto &Actions = getActionDefinitionsBuilder(Op); 798 // Whitelist the common cases. 
799 // TODO: Loads to s16 on gfx9 800 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 801 {V2S32, GlobalPtr, 64, GlobalAlign32}, 802 {V4S32, GlobalPtr, 128, GlobalAlign32}, 803 {S128, GlobalPtr, 128, GlobalAlign32}, 804 {S64, GlobalPtr, 64, GlobalAlign32}, 805 {V2S64, GlobalPtr, 128, GlobalAlign32}, 806 {V2S16, GlobalPtr, 32, GlobalAlign32}, 807 {S32, GlobalPtr, 8, GlobalAlign8}, 808 {S32, GlobalPtr, 16, GlobalAlign16}, 809 810 {S32, LocalPtr, 32, 32}, 811 {S64, LocalPtr, 64, 32}, 812 {V2S32, LocalPtr, 64, 32}, 813 {S32, LocalPtr, 8, 8}, 814 {S32, LocalPtr, 16, 16}, 815 {V2S16, LocalPtr, 32, 32}, 816 817 {S32, PrivatePtr, 32, 32}, 818 {S32, PrivatePtr, 8, 8}, 819 {S32, PrivatePtr, 16, 16}, 820 {V2S16, PrivatePtr, 32, 32}, 821 822 {S32, FlatPtr, 32, GlobalAlign32}, 823 {S32, FlatPtr, 16, GlobalAlign16}, 824 {S32, FlatPtr, 8, GlobalAlign8}, 825 {V2S16, FlatPtr, 32, GlobalAlign32}, 826 827 {S32, ConstantPtr, 32, GlobalAlign32}, 828 {V2S32, ConstantPtr, 64, GlobalAlign32}, 829 {V4S32, ConstantPtr, 128, GlobalAlign32}, 830 {S64, ConstantPtr, 64, GlobalAlign32}, 831 {S128, ConstantPtr, 128, GlobalAlign32}, 832 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 833 Actions 834 .customIf(typeIs(1, Constant32Ptr)) 835 // Widen suitably aligned loads by loading extra elements. 836 .moreElementsIf([=](const LegalityQuery &Query) { 837 const LLT Ty = Query.Types[0]; 838 return Op == G_LOAD && Ty.isVector() && 839 shouldWidenLoadResult(Query); 840 }, moreElementsToNextPow2(0)) 841 .widenScalarIf([=](const LegalityQuery &Query) { 842 const LLT Ty = Query.Types[0]; 843 return Op == G_LOAD && !Ty.isVector() && 844 shouldWidenLoadResult(Query); 845 }, widenScalarOrEltToNextPow2(0)) 846 .narrowScalarIf( 847 [=](const LegalityQuery &Query) -> bool { 848 return !Query.Types[0].isVector() && 849 needToSplitMemOp(Query, Op == G_LOAD); 850 }, 851 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 852 const LLT DstTy = Query.Types[0]; 853 const LLT PtrTy = Query.Types[1]; 854 855 const unsigned DstSize = DstTy.getSizeInBits(); 856 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 857 858 // Split extloads. 859 if (DstSize > MemSize) 860 return std::make_pair(0, LLT::scalar(MemSize)); 861 862 if (!isPowerOf2_32(DstSize)) { 863 // We're probably decomposing an odd sized store. Try to split 864 // to the widest type. TODO: Account for alignment. As-is it 865 // should be OK, since the new parts will be further legalized. 866 unsigned FloorSize = PowerOf2Floor(DstSize); 867 return std::make_pair(0, LLT::scalar(FloorSize)); 868 } 869 870 if (DstSize > 32 && (DstSize % 32 != 0)) { 871 // FIXME: Need a way to specify non-extload of larger size if 872 // suitably aligned. 
873 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 874 } 875 876 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 877 Op == G_LOAD); 878 if (MemSize > MaxSize) 879 return std::make_pair(0, LLT::scalar(MaxSize)); 880 881 unsigned Align = Query.MMODescrs[0].AlignInBits; 882 return std::make_pair(0, LLT::scalar(Align)); 883 }) 884 .fewerElementsIf( 885 [=](const LegalityQuery &Query) -> bool { 886 return Query.Types[0].isVector() && 887 needToSplitMemOp(Query, Op == G_LOAD); 888 }, 889 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 890 const LLT DstTy = Query.Types[0]; 891 const LLT PtrTy = Query.Types[1]; 892 893 LLT EltTy = DstTy.getElementType(); 894 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 895 Op == G_LOAD); 896 897 // FIXME: Handle widened to power of 2 results better. This ends 898 // up scalarizing. 899 // FIXME: 3 element stores scalarized on SI 900 901 // Split if it's too large for the address space. 902 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 903 unsigned NumElts = DstTy.getNumElements(); 904 unsigned EltSize = EltTy.getSizeInBits(); 905 906 if (MaxSize % EltSize == 0) { 907 return std::make_pair( 908 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 909 } 910 911 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 912 913 // FIXME: Refine when odd breakdowns handled 914 // The scalars will need to be re-legalized. 915 if (NumPieces == 1 || NumPieces >= NumElts || 916 NumElts % NumPieces != 0) 917 return std::make_pair(0, EltTy); 918 919 return std::make_pair(0, 920 LLT::vector(NumElts / NumPieces, EltTy)); 921 } 922 923 // FIXME: We could probably handle weird extending loads better. 924 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 925 if (DstTy.getSizeInBits() > MemSize) 926 return std::make_pair(0, EltTy); 927 928 unsigned EltSize = EltTy.getSizeInBits(); 929 unsigned DstSize = DstTy.getSizeInBits(); 930 if (!isPowerOf2_32(DstSize)) { 931 // We're probably decomposing an odd sized store. Try to split 932 // to the widest type. TODO: Account for alignment. As-is it 933 // should be OK, since the new parts will be further legalized. 934 unsigned FloorSize = PowerOf2Floor(DstSize); 935 return std::make_pair( 936 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 937 } 938 939 // Need to split because of alignment. 940 unsigned Align = Query.MMODescrs[0].AlignInBits; 941 if (EltSize > Align && 942 (EltSize / Align < DstTy.getNumElements())) { 943 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 944 } 945 946 // May need relegalization for the scalars. 947 return std::make_pair(0, EltTy); 948 }) 949 .minScalar(0, S32); 950 951 if (IsStore) 952 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 953 954 // TODO: Need a bitcast lower option? 955 Actions 956 .legalIf([=](const LegalityQuery &Query) { 957 const LLT Ty0 = Query.Types[0]; 958 unsigned Size = Ty0.getSizeInBits(); 959 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 960 unsigned Align = Query.MMODescrs[0].AlignInBits; 961 962 // FIXME: Widening store from alignment not valid. 963 if (MemSize < Size) 964 MemSize = std::max(MemSize, Align); 965 966 // No extending vector loads. 
967 if (Size > MemSize && Ty0.isVector()) 968 return false; 969 970 switch (MemSize) { 971 case 8: 972 case 16: 973 return Size == 32; 974 case 32: 975 case 64: 976 case 128: 977 return true; 978 case 96: 979 return ST.hasDwordx3LoadStores(); 980 case 256: 981 case 512: 982 return true; 983 default: 984 return false; 985 } 986 }) 987 .widenScalarToNextPow2(0) 988 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 989 } 990 991 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 992 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 993 {S32, GlobalPtr, 16, 2 * 8}, 994 {S32, LocalPtr, 8, 8}, 995 {S32, LocalPtr, 16, 16}, 996 {S32, PrivatePtr, 8, 8}, 997 {S32, PrivatePtr, 16, 16}, 998 {S32, ConstantPtr, 8, 8}, 999 {S32, ConstantPtr, 16, 2 * 8}}); 1000 if (ST.hasFlatAddressSpace()) { 1001 ExtLoads.legalForTypesWithMemDesc( 1002 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1003 } 1004 1005 ExtLoads.clampScalar(0, S32, S32) 1006 .widenScalarToNextPow2(0) 1007 .unsupportedIfMemSizeNotPow2() 1008 .lower(); 1009 1010 auto &Atomics = getActionDefinitionsBuilder( 1011 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1012 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1013 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1014 G_ATOMICRMW_UMIN}) 1015 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1016 {S64, GlobalPtr}, {S64, LocalPtr}}); 1017 if (ST.hasFlatAddressSpace()) { 1018 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1019 } 1020 1021 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1022 .legalFor({{S32, LocalPtr}}); 1023 1024 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1025 // demarshalling 1026 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1027 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1028 {S32, FlatPtr}, {S64, FlatPtr}}) 1029 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1030 {S32, RegionPtr}, {S64, RegionPtr}}); 1031 // TODO: Pointer types, any 32-bit or 64-bit vector 1032 1033 // Condition should be s32 for scalar, s1 for vector. 1034 getActionDefinitionsBuilder(G_SELECT) 1035 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1036 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1037 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1038 .clampScalar(0, S16, S64) 1039 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1040 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1041 .scalarize(1) 1042 .clampMaxNumElements(0, S32, 2) 1043 .clampMaxNumElements(0, LocalPtr, 2) 1044 .clampMaxNumElements(0, PrivatePtr, 2) 1045 .scalarize(0) 1046 .widenScalarToNextPow2(0) 1047 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1048 1049 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1050 // be more flexible with the shift amount type. 1051 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1052 .legalFor({{S32, S32}, {S64, S32}}); 1053 if (ST.has16BitInsts()) { 1054 if (ST.hasVOP3PInsts()) { 1055 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 1056 .clampMaxNumElements(0, S16, 2); 1057 } else 1058 Shifts.legalFor({{S16, S32}, {S16, S16}}); 1059 1060 // TODO: Support 16-bit shift amounts 1061 Shifts.clampScalar(1, S32, S32); 1062 Shifts.clampScalar(0, S16, S64); 1063 Shifts.widenScalarToNextPow2(0, 16); 1064 } else { 1065 // Make sure we legalize the shift amount type first, as the general 1066 // expansion for the shifted type will produce much worse code if it hasn't 1067 // been truncated already. 
1068 Shifts.clampScalar(1, S32, S32); 1069 Shifts.clampScalar(0, S32, S64); 1070 Shifts.widenScalarToNextPow2(0, 32); 1071 } 1072 Shifts.scalarize(0); 1073 1074 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1075 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1076 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1077 unsigned IdxTypeIdx = 2; 1078 1079 getActionDefinitionsBuilder(Op) 1080 .customIf([=](const LegalityQuery &Query) { 1081 const LLT EltTy = Query.Types[EltTypeIdx]; 1082 const LLT VecTy = Query.Types[VecTypeIdx]; 1083 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1084 return (EltTy.getSizeInBits() == 16 || 1085 EltTy.getSizeInBits() % 32 == 0) && 1086 VecTy.getSizeInBits() % 32 == 0 && 1087 VecTy.getSizeInBits() <= 1024 && 1088 IdxTy.getSizeInBits() == 32; 1089 }) 1090 .clampScalar(EltTypeIdx, S32, S64) 1091 .clampScalar(VecTypeIdx, S32, S64) 1092 .clampScalar(IdxTypeIdx, S32, S32); 1093 } 1094 1095 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1096 .unsupportedIf([=](const LegalityQuery &Query) { 1097 const LLT &EltTy = Query.Types[1].getElementType(); 1098 return Query.Types[0] != EltTy; 1099 }); 1100 1101 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1102 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1103 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1104 1105 // FIXME: Doesn't handle extract of illegal sizes. 1106 getActionDefinitionsBuilder(Op) 1107 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1108 // FIXME: Multiples of 16 should not be legal. 1109 .legalIf([=](const LegalityQuery &Query) { 1110 const LLT BigTy = Query.Types[BigTyIdx]; 1111 const LLT LitTy = Query.Types[LitTyIdx]; 1112 return (BigTy.getSizeInBits() % 32 == 0) && 1113 (LitTy.getSizeInBits() % 16 == 0); 1114 }) 1115 .widenScalarIf( 1116 [=](const LegalityQuery &Query) { 1117 const LLT BigTy = Query.Types[BigTyIdx]; 1118 return (BigTy.getScalarSizeInBits() < 16); 1119 }, 1120 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1121 .widenScalarIf( 1122 [=](const LegalityQuery &Query) { 1123 const LLT LitTy = Query.Types[LitTyIdx]; 1124 return (LitTy.getScalarSizeInBits() < 16); 1125 }, 1126 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1127 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1128 .widenScalarToNextPow2(BigTyIdx, 32); 1129 1130 } 1131 1132 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1133 .legalForCartesianProduct(AllS32Vectors, {S32}) 1134 .legalForCartesianProduct(AllS64Vectors, {S64}) 1135 .clampNumElements(0, V16S32, V32S32) 1136 .clampNumElements(0, V2S64, V16S64) 1137 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1138 1139 if (ST.hasScalarPackInsts()) { 1140 BuildVector 1141 // FIXME: Should probably widen s1 vectors straight to s32 1142 .minScalarOrElt(0, S16) 1143 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1144 .minScalar(1, S32); 1145 1146 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1147 .legalFor({V2S16, S32}) 1148 .lower(); 1149 BuildVector.minScalarOrElt(0, S32); 1150 } else { 1151 BuildVector.customFor({V2S16, S16}); 1152 BuildVector.minScalarOrElt(0, S32); 1153 1154 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1155 .customFor({V2S16, S32}) 1156 .lower(); 1157 } 1158 1159 BuildVector.legalIf(isRegisterType(0)); 1160 1161 // FIXME: Clamp maximum size 1162 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1163 .legalIf(isRegisterType(0)); 1164 1165 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1166 // pre-legalize. 
1167 if (ST.hasVOP3PInsts()) { 1168 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1169 .customFor({V2S16, V2S16}) 1170 .lower(); 1171 } else 1172 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1173 1174 // Merge/Unmerge 1175 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1176 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1177 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1178 1179 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1180 const LLT &Ty = Query.Types[TypeIdx]; 1181 if (Ty.isVector()) { 1182 const LLT &EltTy = Ty.getElementType(); 1183 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) 1184 return true; 1185 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1186 return true; 1187 } 1188 return false; 1189 }; 1190 1191 auto &Builder = getActionDefinitionsBuilder(Op) 1192 // Try to widen to s16 first for small types. 1193 // TODO: Only do this on targets with legal s16 shifts 1194 .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1195 1196 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1197 .lowerFor({{S16, V2S16}}) 1198 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1199 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1200 elementTypeIs(1, S16)), 1201 changeTo(1, V2S16)) 1202 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1203 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1204 // valid. 1205 .clampScalar(LitTyIdx, S32, S256) 1206 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1207 // Break up vectors with weird elements into scalars 1208 .fewerElementsIf( 1209 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, 1210 scalarize(0)) 1211 .fewerElementsIf( 1212 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, 1213 scalarize(1)) 1214 .clampScalar(BigTyIdx, S32, S1024); 1215 1216 if (Op == G_MERGE_VALUES) { 1217 Builder.widenScalarIf( 1218 // TODO: Use 16-bit shifts if legal for 8-bit values? 1219 [=](const LegalityQuery &Query) { 1220 const LLT Ty = Query.Types[LitTyIdx]; 1221 return Ty.getSizeInBits() < 32; 1222 }, 1223 changeTo(LitTyIdx, S32)); 1224 } 1225 1226 Builder.widenScalarIf( 1227 [=](const LegalityQuery &Query) { 1228 const LLT Ty = Query.Types[BigTyIdx]; 1229 return !isPowerOf2_32(Ty.getSizeInBits()) && 1230 Ty.getSizeInBits() % 16 != 0; 1231 }, 1232 [=](const LegalityQuery &Query) { 1233 // Pick the next power of 2, or a multiple of 64 over 128. 1234 // Whichever is smaller. 1235 const LLT &Ty = Query.Types[BigTyIdx]; 1236 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1237 if (NewSizeInBits >= 256) { 1238 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1239 if (RoundedTo < NewSizeInBits) 1240 NewSizeInBits = RoundedTo; 1241 } 1242 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1243 }) 1244 .legalIf([=](const LegalityQuery &Query) { 1245 const LLT &BigTy = Query.Types[BigTyIdx]; 1246 const LLT &LitTy = Query.Types[LitTyIdx]; 1247 1248 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1249 return false; 1250 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1251 return false; 1252 1253 return BigTy.getSizeInBits() % 16 == 0 && 1254 LitTy.getSizeInBits() % 16 == 0 && 1255 BigTy.getSizeInBits() <= 1024; 1256 }) 1257 // Any vectors left are the wrong size. Scalarize them. 1258 .scalarize(0) 1259 .scalarize(1); 1260 } 1261 1262 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1263 // RegBankSelect. 
1264 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1265 .legalFor({{S32}, {S64}}); 1266 1267 if (ST.hasVOP3PInsts()) { 1268 SextInReg.lowerFor({{V2S16}}) 1269 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1270 // get more vector shift opportunities, since we'll get those when 1271 // expanded. 1272 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1273 } else if (ST.has16BitInsts()) { 1274 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1275 } else { 1276 // Prefer to promote to s32 before lowering if we don't have 16-bit 1277 // shifts. This avoid a lot of intermediate truncate and extend operations. 1278 SextInReg.lowerFor({{S32}, {S64}}); 1279 } 1280 1281 SextInReg 1282 .scalarize(0) 1283 .clampScalar(0, S32, S64) 1284 .lower(); 1285 1286 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1287 .legalFor({S64}); 1288 1289 getActionDefinitionsBuilder({ 1290 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1291 G_FCOPYSIGN, 1292 1293 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1294 G_READ_REGISTER, 1295 G_WRITE_REGISTER, 1296 1297 G_SADDO, G_SSUBO, 1298 1299 // TODO: Implement 1300 G_FMINIMUM, G_FMAXIMUM 1301 }).lower(); 1302 1303 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1304 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1305 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1306 .unsupported(); 1307 1308 computeTables(); 1309 verify(*ST.getInstrInfo()); 1310 } 1311 1312 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1313 MachineRegisterInfo &MRI, 1314 MachineIRBuilder &B, 1315 GISelChangeObserver &Observer) const { 1316 switch (MI.getOpcode()) { 1317 case TargetOpcode::G_ADDRSPACE_CAST: 1318 return legalizeAddrSpaceCast(MI, MRI, B); 1319 case TargetOpcode::G_FRINT: 1320 return legalizeFrint(MI, MRI, B); 1321 case TargetOpcode::G_FCEIL: 1322 return legalizeFceil(MI, MRI, B); 1323 case TargetOpcode::G_INTRINSIC_TRUNC: 1324 return legalizeIntrinsicTrunc(MI, MRI, B); 1325 case TargetOpcode::G_SITOFP: 1326 return legalizeITOFP(MI, MRI, B, true); 1327 case TargetOpcode::G_UITOFP: 1328 return legalizeITOFP(MI, MRI, B, false); 1329 case TargetOpcode::G_FPTOSI: 1330 return legalizeFPTOI(MI, MRI, B, true); 1331 case TargetOpcode::G_FPTOUI: 1332 return legalizeFPTOI(MI, MRI, B, false); 1333 case TargetOpcode::G_FMINNUM: 1334 case TargetOpcode::G_FMAXNUM: 1335 case TargetOpcode::G_FMINNUM_IEEE: 1336 case TargetOpcode::G_FMAXNUM_IEEE: 1337 return legalizeMinNumMaxNum(MI, MRI, B); 1338 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1339 return legalizeExtractVectorElt(MI, MRI, B); 1340 case TargetOpcode::G_INSERT_VECTOR_ELT: 1341 return legalizeInsertVectorElt(MI, MRI, B); 1342 case TargetOpcode::G_SHUFFLE_VECTOR: 1343 return legalizeShuffleVector(MI, MRI, B); 1344 case TargetOpcode::G_FSIN: 1345 case TargetOpcode::G_FCOS: 1346 return legalizeSinCos(MI, MRI, B); 1347 case TargetOpcode::G_GLOBAL_VALUE: 1348 return legalizeGlobalValue(MI, MRI, B); 1349 case TargetOpcode::G_LOAD: 1350 return legalizeLoad(MI, MRI, B, Observer); 1351 case TargetOpcode::G_FMAD: 1352 return legalizeFMad(MI, MRI, B); 1353 case TargetOpcode::G_FDIV: 1354 return legalizeFDIV(MI, MRI, B); 1355 case TargetOpcode::G_UDIV: 1356 case TargetOpcode::G_UREM: 1357 return legalizeUDIV_UREM(MI, MRI, B); 1358 case TargetOpcode::G_SDIV: 1359 case TargetOpcode::G_SREM: 1360 return legalizeSDIV_SREM(MI, MRI, B); 1361 case TargetOpcode::G_ATOMIC_CMPXCHG: 1362 return legalizeAtomicCmpXChg(MI, MRI, B); 1363 case TargetOpcode::G_FLOG: 1364 return legalizeFlog(MI, B, 1.0f / 
numbers::log2ef); 1365 case TargetOpcode::G_FLOG10: 1366 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1367 case TargetOpcode::G_FEXP: 1368 return legalizeFExp(MI, B); 1369 case TargetOpcode::G_FPOW: 1370 return legalizeFPow(MI, B); 1371 case TargetOpcode::G_FFLOOR: 1372 return legalizeFFloor(MI, MRI, B); 1373 case TargetOpcode::G_BUILD_VECTOR: 1374 return legalizeBuildVector(MI, MRI, B); 1375 default: 1376 return false; 1377 } 1378 1379 llvm_unreachable("expected switch to return"); 1380 } 1381 1382 Register AMDGPULegalizerInfo::getSegmentAperture( 1383 unsigned AS, 1384 MachineRegisterInfo &MRI, 1385 MachineIRBuilder &B) const { 1386 MachineFunction &MF = B.getMF(); 1387 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1388 const LLT S32 = LLT::scalar(32); 1389 1390 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1391 1392 if (ST.hasApertureRegs()) { 1393 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1394 // getreg. 1395 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1396 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1397 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1398 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1399 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1400 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1401 unsigned Encoding = 1402 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1403 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1404 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1405 1406 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1407 1408 B.buildInstr(AMDGPU::S_GETREG_B32) 1409 .addDef(GetReg) 1410 .addImm(Encoding); 1411 MRI.setType(GetReg, S32); 1412 1413 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1414 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1415 } 1416 1417 Register QueuePtr = MRI.createGenericVirtualRegister( 1418 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1419 1420 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1421 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1422 return Register(); 1423 1424 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1425 // private_segment_aperture_base_hi. 1426 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1427 1428 // TODO: can we be smarter about machine pointer info? 1429 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1430 MachineMemOperand *MMO = MF.getMachineMemOperand( 1431 PtrInfo, 1432 MachineMemOperand::MOLoad | 1433 MachineMemOperand::MODereferenceable | 1434 MachineMemOperand::MOInvariant, 1435 4, 1436 MinAlign(64, StructOffset)); 1437 1438 Register LoadAddr; 1439 1440 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1441 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1442 } 1443 1444 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1445 MachineInstr &MI, MachineRegisterInfo &MRI, 1446 MachineIRBuilder &B) const { 1447 MachineFunction &MF = B.getMF(); 1448 1449 B.setInstr(MI); 1450 1451 const LLT S32 = LLT::scalar(32); 1452 Register Dst = MI.getOperand(0).getReg(); 1453 Register Src = MI.getOperand(1).getReg(); 1454 1455 LLT DstTy = MRI.getType(Dst); 1456 LLT SrcTy = MRI.getType(Src); 1457 unsigned DestAS = DstTy.getAddressSpace(); 1458 unsigned SrcAS = SrcTy.getAddressSpace(); 1459 1460 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1461 // vector element. 
1462 assert(!DstTy.isVector()); 1463 1464 const AMDGPUTargetMachine &TM 1465 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1466 1467 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1468 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1469 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1470 return true; 1471 } 1472 1473 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1474 // Truncate. 1475 B.buildExtract(Dst, Src, 0); 1476 MI.eraseFromParent(); 1477 return true; 1478 } 1479 1480 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1481 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1482 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1483 1484 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1485 // another. Merge operands are required to be the same type, but creating an 1486 // extra ptrtoint would be kind of pointless. 1487 auto HighAddr = B.buildConstant( 1488 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1489 B.buildMerge(Dst, {Src, HighAddr}); 1490 MI.eraseFromParent(); 1491 return true; 1492 } 1493 1494 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1495 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1496 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1497 unsigned NullVal = TM.getNullPointerValue(DestAS); 1498 1499 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1500 auto FlatNull = B.buildConstant(SrcTy, 0); 1501 1502 // Extract low 32-bits of the pointer. 1503 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1504 1505 auto CmpRes = 1506 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1507 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1508 1509 MI.eraseFromParent(); 1510 return true; 1511 } 1512 1513 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1514 return false; 1515 1516 if (!ST.hasFlatAddressSpace()) 1517 return false; 1518 1519 auto SegmentNull = 1520 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1521 auto FlatNull = 1522 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1523 1524 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1525 if (!ApertureReg.isValid()) 1526 return false; 1527 1528 auto CmpRes = 1529 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1530 1531 // Coerce the type of the low half of the result so we can use merge_values. 1532 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1533 1534 // TODO: Should we allow mismatched types but matching sizes in merges to 1535 // avoid the ptrtoint? 1536 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1537 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1538 1539 MI.eraseFromParent(); 1540 return true; 1541 } 1542 1543 bool AMDGPULegalizerInfo::legalizeFrint( 1544 MachineInstr &MI, MachineRegisterInfo &MRI, 1545 MachineIRBuilder &B) const { 1546 B.setInstr(MI); 1547 1548 Register Src = MI.getOperand(1).getReg(); 1549 LLT Ty = MRI.getType(Src); 1550 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1551 1552 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1553 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1554 1555 auto C1 = B.buildFConstant(Ty, C1Val); 1556 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1557 1558 // TODO: Should this propagate fast-math-flags? 
1559 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1560 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1561 1562 auto C2 = B.buildFConstant(Ty, C2Val); 1563 auto Fabs = B.buildFAbs(Ty, Src); 1564 1565 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1566 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1567 return true; 1568 } 1569 1570 bool AMDGPULegalizerInfo::legalizeFceil( 1571 MachineInstr &MI, MachineRegisterInfo &MRI, 1572 MachineIRBuilder &B) const { 1573 B.setInstr(MI); 1574 1575 const LLT S1 = LLT::scalar(1); 1576 const LLT S64 = LLT::scalar(64); 1577 1578 Register Src = MI.getOperand(1).getReg(); 1579 assert(MRI.getType(Src) == S64); 1580 1581 // result = trunc(src) 1582 // if (src > 0.0 && src != result) 1583 // result += 1.0 1584 1585 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1586 1587 const auto Zero = B.buildFConstant(S64, 0.0); 1588 const auto One = B.buildFConstant(S64, 1.0); 1589 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1590 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1591 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1592 auto Add = B.buildSelect(S64, And, One, Zero); 1593 1594 // TODO: Should this propagate fast-math-flags? 1595 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1596 return true; 1597 } 1598 1599 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1600 MachineIRBuilder &B) { 1601 const unsigned FractBits = 52; 1602 const unsigned ExpBits = 11; 1603 LLT S32 = LLT::scalar(32); 1604 1605 auto Const0 = B.buildConstant(S32, FractBits - 32); 1606 auto Const1 = B.buildConstant(S32, ExpBits); 1607 1608 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1609 .addUse(Const0.getReg(0)) 1610 .addUse(Const1.getReg(0)); 1611 1612 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1613 } 1614 1615 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1616 MachineInstr &MI, MachineRegisterInfo &MRI, 1617 MachineIRBuilder &B) const { 1618 B.setInstr(MI); 1619 1620 const LLT S1 = LLT::scalar(1); 1621 const LLT S32 = LLT::scalar(32); 1622 const LLT S64 = LLT::scalar(64); 1623 1624 Register Src = MI.getOperand(1).getReg(); 1625 assert(MRI.getType(Src) == S64); 1626 1627 // TODO: Should this use extract since the low half is unused? 1628 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1629 Register Hi = Unmerge.getReg(1); 1630 1631 // Extract the upper half, since this is where we will find the sign and 1632 // exponent. 1633 auto Exp = extractF64Exponent(Hi, B); 1634 1635 const unsigned FractBits = 52; 1636 1637 // Extract the sign bit. 1638 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1639 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1640 1641 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1642 1643 const auto Zero32 = B.buildConstant(S32, 0); 1644 1645 // Extend back to 64-bits. 
1646 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1647 1648 auto Shr = B.buildAShr(S64, FractMask, Exp); 1649 auto Not = B.buildNot(S64, Shr); 1650 auto Tmp0 = B.buildAnd(S64, Src, Not); 1651 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1652 1653 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1654 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1655 1656 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1657 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1658 return true; 1659 } 1660 1661 bool AMDGPULegalizerInfo::legalizeITOFP( 1662 MachineInstr &MI, MachineRegisterInfo &MRI, 1663 MachineIRBuilder &B, bool Signed) const { 1664 B.setInstr(MI); 1665 1666 Register Dst = MI.getOperand(0).getReg(); 1667 Register Src = MI.getOperand(1).getReg(); 1668 1669 const LLT S64 = LLT::scalar(64); 1670 const LLT S32 = LLT::scalar(32); 1671 1672 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1673 1674 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1675 1676 auto CvtHi = Signed ? 1677 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1678 B.buildUITOFP(S64, Unmerge.getReg(1)); 1679 1680 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1681 1682 auto ThirtyTwo = B.buildConstant(S32, 32); 1683 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1684 .addUse(CvtHi.getReg(0)) 1685 .addUse(ThirtyTwo.getReg(0)); 1686 1687 // TODO: Should this propagate fast-math-flags? 1688 B.buildFAdd(Dst, LdExp, CvtLo); 1689 MI.eraseFromParent(); 1690 return true; 1691 } 1692 1693 // TODO: Copied from DAG implementation. Verify logic and document how this 1694 // actually works. 1695 bool AMDGPULegalizerInfo::legalizeFPTOI( 1696 MachineInstr &MI, MachineRegisterInfo &MRI, 1697 MachineIRBuilder &B, bool Signed) const { 1698 B.setInstr(MI); 1699 1700 Register Dst = MI.getOperand(0).getReg(); 1701 Register Src = MI.getOperand(1).getReg(); 1702 1703 const LLT S64 = LLT::scalar(64); 1704 const LLT S32 = LLT::scalar(32); 1705 1706 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1707 1708 unsigned Flags = MI.getFlags(); 1709 1710 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1711 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1712 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1713 1714 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1715 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1716 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1717 1718 auto Hi = Signed ? 
1719 B.buildFPTOSI(S32, FloorMul) : 1720 B.buildFPTOUI(S32, FloorMul); 1721 auto Lo = B.buildFPTOUI(S32, Fma); 1722 1723 B.buildMerge(Dst, { Lo, Hi }); 1724 MI.eraseFromParent(); 1725 1726 return true; 1727 } 1728 1729 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1730 MachineInstr &MI, MachineRegisterInfo &MRI, 1731 MachineIRBuilder &B) const { 1732 MachineFunction &MF = B.getMF(); 1733 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1734 1735 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1736 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1737 1738 // With ieee_mode disabled, the instructions have the correct behavior 1739 // already for G_FMINNUM/G_FMAXNUM 1740 if (!MFI->getMode().IEEE) 1741 return !IsIEEEOp; 1742 1743 if (IsIEEEOp) 1744 return true; 1745 1746 MachineIRBuilder HelperBuilder(MI); 1747 GISelObserverWrapper DummyObserver; 1748 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1749 HelperBuilder.setInstr(MI); 1750 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1751 } 1752 1753 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1754 MachineInstr &MI, MachineRegisterInfo &MRI, 1755 MachineIRBuilder &B) const { 1756 // TODO: Should move some of this into LegalizerHelper. 1757 1758 // TODO: Promote dynamic indexing of s16 to s32 1759 1760 // FIXME: Artifact combiner probably should have replaced the truncated 1761 // constant before this, so we shouldn't need 1762 // getConstantVRegValWithLookThrough. 1763 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1764 MI.getOperand(2).getReg(), MRI); 1765 if (!IdxVal) // Dynamic case will be selected to register indexing. 1766 return true; 1767 1768 Register Dst = MI.getOperand(0).getReg(); 1769 Register Vec = MI.getOperand(1).getReg(); 1770 1771 LLT VecTy = MRI.getType(Vec); 1772 LLT EltTy = VecTy.getElementType(); 1773 assert(EltTy == MRI.getType(Dst)); 1774 1775 B.setInstr(MI); 1776 1777 if (IdxVal->Value < VecTy.getNumElements()) 1778 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1779 else 1780 B.buildUndef(Dst); 1781 1782 MI.eraseFromParent(); 1783 return true; 1784 } 1785 1786 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1787 MachineInstr &MI, MachineRegisterInfo &MRI, 1788 MachineIRBuilder &B) const { 1789 // TODO: Should move some of this into LegalizerHelper. 1790 1791 // TODO: Promote dynamic indexing of s16 to s32 1792 1793 // FIXME: Artifact combiner probably should have replaced the truncated 1794 // constant before this, so we shouldn't need 1795 // getConstantVRegValWithLookThrough. 1796 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1797 MI.getOperand(3).getReg(), MRI); 1798 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1799 return true; 1800 1801 Register Dst = MI.getOperand(0).getReg(); 1802 Register Vec = MI.getOperand(1).getReg(); 1803 Register Ins = MI.getOperand(2).getReg(); 1804 1805 LLT VecTy = MRI.getType(Vec); 1806 LLT EltTy = VecTy.getElementType(); 1807 assert(EltTy == MRI.getType(Ins)); 1808 1809 B.setInstr(MI); 1810 1811 if (IdxVal->Value < VecTy.getNumElements()) 1812 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1813 else 1814 B.buildUndef(Dst); 1815 1816 MI.eraseFromParent(); 1817 return true; 1818 } 1819 1820 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1821 MachineInstr &MI, MachineRegisterInfo &MRI, 1822 MachineIRBuilder &B) const { 1823 const LLT V2S16 = LLT::vector(2, 16); 1824 1825 Register Dst = MI.getOperand(0).getReg(); 1826 Register Src0 = MI.getOperand(1).getReg(); 1827 LLT DstTy = MRI.getType(Dst); 1828 LLT SrcTy = MRI.getType(Src0); 1829 1830 if (SrcTy == V2S16 && DstTy == V2S16 && 1831 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1832 return true; 1833 1834 MachineIRBuilder HelperBuilder(MI); 1835 GISelObserverWrapper DummyObserver; 1836 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1837 HelperBuilder.setInstr(MI); 1838 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1839 } 1840 1841 bool AMDGPULegalizerInfo::legalizeSinCos( 1842 MachineInstr &MI, MachineRegisterInfo &MRI, 1843 MachineIRBuilder &B) const { 1844 B.setInstr(MI); 1845 1846 Register DstReg = MI.getOperand(0).getReg(); 1847 Register SrcReg = MI.getOperand(1).getReg(); 1848 LLT Ty = MRI.getType(DstReg); 1849 unsigned Flags = MI.getFlags(); 1850 1851 Register TrigVal; 1852 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1853 if (ST.hasTrigReducedRange()) { 1854 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1855 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1856 .addUse(MulVal.getReg(0)) 1857 .setMIFlags(Flags).getReg(0); 1858 } else 1859 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1860 1861 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1862 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1863 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1864 .addUse(TrigVal) 1865 .setMIFlags(Flags); 1866 MI.eraseFromParent(); 1867 return true; 1868 } 1869 1870 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1871 Register DstReg, LLT PtrTy, 1872 MachineIRBuilder &B, const GlobalValue *GV, 1873 unsigned Offset, unsigned GAFlags) const { 1874 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1875 // to the following code sequence: 1876 // 1877 // For constant address space: 1878 // s_getpc_b64 s[0:1] 1879 // s_add_u32 s0, s0, $symbol 1880 // s_addc_u32 s1, s1, 0 1881 // 1882 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1883 // a fixup or relocation is emitted to replace $symbol with a literal 1884 // constant, which is a pc-relative offset from the encoding of the $symbol 1885 // operand to the global variable. 
1886 // 1887 // For global address space: 1888 // s_getpc_b64 s[0:1] 1889 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1890 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1891 // 1892 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1893 // fixups or relocations are emitted to replace $symbol@*@lo and 1894 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1895 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1896 // operand to the global variable. 1897 // 1898 // What we want here is an offset from the value returned by s_getpc 1899 // (which is the address of the s_add_u32 instruction) to the global 1900 // variable, but since the encoding of $symbol starts 4 bytes after the start 1901 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1902 // small. This requires us to add 4 to the global variable offset in order to 1903 // compute the correct address. 1904 1905 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1906 1907 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1908 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1909 1910 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1911 .addDef(PCReg); 1912 1913 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1914 if (GAFlags == SIInstrInfo::MO_NONE) 1915 MIB.addImm(0); 1916 else 1917 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1918 1919 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1920 1921 if (PtrTy.getSizeInBits() == 32) 1922 B.buildExtract(DstReg, PCReg, 0); 1923 return true; 1924 } 1925 1926 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1927 MachineInstr &MI, MachineRegisterInfo &MRI, 1928 MachineIRBuilder &B) const { 1929 Register DstReg = MI.getOperand(0).getReg(); 1930 LLT Ty = MRI.getType(DstReg); 1931 unsigned AS = Ty.getAddressSpace(); 1932 1933 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1934 MachineFunction &MF = B.getMF(); 1935 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1936 B.setInstr(MI); 1937 1938 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1939 if (!MFI->isEntryFunction()) { 1940 const Function &Fn = MF.getFunction(); 1941 DiagnosticInfoUnsupported BadLDSDecl( 1942 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1943 Fn.getContext().diagnose(BadLDSDecl); 1944 } 1945 1946 // TODO: We could emit code to handle the initialization somewhere. 
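    // An LDS global without a defined initializer is lowered to a per-kernel
    // constant offset from allocateLDSGlobal(); a global with an initializer
    // is diagnosed as unsupported below.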
1947 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1948 const SITargetLowering *TLI = ST.getTargetLowering(); 1949 if (!TLI->shouldUseLDSConstAddress(GV)) { 1950 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 1951 return true; // Leave in place; 1952 } 1953 1954 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 1955 MI.eraseFromParent(); 1956 return true; 1957 } 1958 1959 const Function &Fn = MF.getFunction(); 1960 DiagnosticInfoUnsupported BadInit( 1961 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 1962 Fn.getContext().diagnose(BadInit); 1963 return true; 1964 } 1965 1966 const SITargetLowering *TLI = ST.getTargetLowering(); 1967 1968 if (TLI->shouldEmitFixup(GV)) { 1969 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 1970 MI.eraseFromParent(); 1971 return true; 1972 } 1973 1974 if (TLI->shouldEmitPCReloc(GV)) { 1975 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 1976 MI.eraseFromParent(); 1977 return true; 1978 } 1979 1980 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1981 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 1982 1983 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 1984 MachinePointerInfo::getGOT(MF), 1985 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1986 MachineMemOperand::MOInvariant, 1987 8 /*Size*/, 8 /*Align*/); 1988 1989 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 1990 1991 if (Ty.getSizeInBits() == 32) { 1992 // Truncate if this is a 32-bit constant adrdess. 1993 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 1994 B.buildExtract(DstReg, Load, 0); 1995 } else 1996 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 1997 1998 MI.eraseFromParent(); 1999 return true; 2000 } 2001 2002 bool AMDGPULegalizerInfo::legalizeLoad( 2003 MachineInstr &MI, MachineRegisterInfo &MRI, 2004 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2005 B.setInstr(MI); 2006 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2007 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2008 Observer.changingInstr(MI); 2009 MI.getOperand(1).setReg(Cast.getReg(0)); 2010 Observer.changedInstr(MI); 2011 return true; 2012 } 2013 2014 bool AMDGPULegalizerInfo::legalizeFMad( 2015 MachineInstr &MI, MachineRegisterInfo &MRI, 2016 MachineIRBuilder &B) const { 2017 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2018 assert(Ty.isScalar()); 2019 2020 MachineFunction &MF = B.getMF(); 2021 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2022 2023 // TODO: Always legal with future ftz flag. 2024 // FIXME: Do we need just output? 
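  // G_FMAD stays legal only when the relevant denormal mode flushes, so the
  // fused mad can be selected directly; otherwise lower it to separate
  // G_FMUL + G_FADD via LegalizerHelper::lowerFMad below.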
2025 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2026 return true; 2027 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2028 return true; 2029 2030 MachineIRBuilder HelperBuilder(MI); 2031 GISelObserverWrapper DummyObserver; 2032 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2033 HelperBuilder.setMBB(*MI.getParent()); 2034 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2035 } 2036 2037 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2038 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2039 Register DstReg = MI.getOperand(0).getReg(); 2040 Register PtrReg = MI.getOperand(1).getReg(); 2041 Register CmpVal = MI.getOperand(2).getReg(); 2042 Register NewVal = MI.getOperand(3).getReg(); 2043 2044 assert(SITargetLowering::isFlatGlobalAddrSpace( 2045 MRI.getType(PtrReg).getAddressSpace()) && 2046 "this should not have been custom lowered"); 2047 2048 LLT ValTy = MRI.getType(CmpVal); 2049 LLT VecTy = LLT::vector(2, ValTy); 2050 2051 B.setInstr(MI); 2052 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2053 2054 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2055 .addDef(DstReg) 2056 .addUse(PtrReg) 2057 .addUse(PackedVal) 2058 .setMemRefs(MI.memoperands()); 2059 2060 MI.eraseFromParent(); 2061 return true; 2062 } 2063 2064 bool AMDGPULegalizerInfo::legalizeFlog( 2065 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2066 Register Dst = MI.getOperand(0).getReg(); 2067 Register Src = MI.getOperand(1).getReg(); 2068 LLT Ty = B.getMRI()->getType(Dst); 2069 unsigned Flags = MI.getFlags(); 2070 B.setInstr(MI); 2071 2072 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2073 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2074 2075 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2076 MI.eraseFromParent(); 2077 return true; 2078 } 2079 2080 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2081 MachineIRBuilder &B) const { 2082 Register Dst = MI.getOperand(0).getReg(); 2083 Register Src = MI.getOperand(1).getReg(); 2084 unsigned Flags = MI.getFlags(); 2085 LLT Ty = B.getMRI()->getType(Dst); 2086 B.setInstr(MI); 2087 2088 auto K = B.buildFConstant(Ty, numbers::log2e); 2089 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2090 B.buildFExp2(Dst, Mul, Flags); 2091 MI.eraseFromParent(); 2092 return true; 2093 } 2094 2095 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2096 MachineIRBuilder &B) const { 2097 Register Dst = MI.getOperand(0).getReg(); 2098 Register Src0 = MI.getOperand(1).getReg(); 2099 Register Src1 = MI.getOperand(2).getReg(); 2100 unsigned Flags = MI.getFlags(); 2101 LLT Ty = B.getMRI()->getType(Dst); 2102 B.setInstr(MI); 2103 const LLT S16 = LLT::scalar(16); 2104 const LLT S32 = LLT::scalar(32); 2105 2106 if (Ty == S32) { 2107 auto Log = B.buildFLog2(S32, Src0, Flags); 2108 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2109 .addUse(Log.getReg(0)) 2110 .addUse(Src1) 2111 .setMIFlags(Flags); 2112 B.buildFExp2(Dst, Mul, Flags); 2113 } else if (Ty == S16) { 2114 // There's no f16 fmul_legacy, so we need to convert for it. 
2115 auto Log = B.buildFLog2(S16, Src0, Flags); 2116 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2117 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2118 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2119 .addUse(Ext0.getReg(0)) 2120 .addUse(Ext1.getReg(0)) 2121 .setMIFlags(Flags); 2122 2123 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2124 } else 2125 return false; 2126 2127 MI.eraseFromParent(); 2128 return true; 2129 } 2130 2131 // Find a source register, ignoring any possible source modifiers. 2132 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2133 Register ModSrc = OrigSrc; 2134 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2135 ModSrc = SrcFNeg->getOperand(1).getReg(); 2136 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2137 ModSrc = SrcFAbs->getOperand(1).getReg(); 2138 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2139 ModSrc = SrcFAbs->getOperand(1).getReg(); 2140 return ModSrc; 2141 } 2142 2143 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2144 MachineRegisterInfo &MRI, 2145 MachineIRBuilder &B) const { 2146 B.setInstr(MI); 2147 2148 const LLT S1 = LLT::scalar(1); 2149 const LLT S64 = LLT::scalar(64); 2150 Register Dst = MI.getOperand(0).getReg(); 2151 Register OrigSrc = MI.getOperand(1).getReg(); 2152 unsigned Flags = MI.getFlags(); 2153 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2154 "this should not have been custom lowered"); 2155 2156 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2157 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2158 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2159 // V_FRACT bug is: 2160 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2161 // 2162 // Convert floor(x) to (x - fract(x)) 2163 2164 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2165 .addUse(OrigSrc) 2166 .setMIFlags(Flags); 2167 2168 // Give source modifier matching some assistance before obscuring a foldable 2169 // pattern. 2170 2171 // TODO: We can avoid the neg on the fract? The input sign to fract 2172 // shouldn't matter? 2173 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2174 2175 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2176 2177 Register Min = MRI.createGenericVirtualRegister(S64); 2178 2179 // We don't need to concern ourselves with the snan handling difference, so 2180 // use the one which will directly select. 2181 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2182 if (MFI->getMode().IEEE) 2183 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2184 else 2185 B.buildFMinNum(Min, Fract, Const, Flags); 2186 2187 Register CorrectedFract = Min; 2188 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2189 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2190 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2191 } 2192 2193 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2194 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2195 2196 MI.eraseFromParent(); 2197 return true; 2198 } 2199 2200 // Turn an illegal packed v2s16 build vector into bit operations. 2201 // TODO: This should probably be a bitcast action in LegalizerHelper. 
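// The two s16 sources are merged into a single s32 and bitcast to the v2s16
// result, so no shifts or masks are actually emitted here.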
2202 bool AMDGPULegalizerInfo::legalizeBuildVector( 2203 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2204 Register Dst = MI.getOperand(0).getReg(); 2205 LLT DstTy = MRI.getType(Dst); 2206 const LLT S32 = LLT::scalar(32); 2207 const LLT V2S16 = LLT::vector(2, 16); 2208 (void)DstTy; 2209 (void)V2S16; 2210 assert(DstTy == V2S16); 2211 2212 Register Src0 = MI.getOperand(1).getReg(); 2213 Register Src1 = MI.getOperand(2).getReg(); 2214 assert(MRI.getType(Src0) == LLT::scalar(16)); 2215 2216 B.setInstr(MI); 2217 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2218 B.buildBitcast(Dst, Merge); 2219 2220 MI.eraseFromParent(); 2221 return true; 2222 } 2223 2224 // Return the use branch instruction, otherwise null if the usage is invalid. 2225 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2226 MachineRegisterInfo &MRI, 2227 MachineInstr *&Br) { 2228 Register CondDef = MI.getOperand(0).getReg(); 2229 if (!MRI.hasOneNonDBGUse(CondDef)) 2230 return nullptr; 2231 2232 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2233 if (UseMI.getParent() != MI.getParent() || 2234 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2235 return nullptr; 2236 2237 // Make sure the cond br is followed by a G_BR 2238 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2239 if (Next != MI.getParent()->end()) { 2240 if (Next->getOpcode() != AMDGPU::G_BR) 2241 return nullptr; 2242 Br = &*Next; 2243 } 2244 2245 return &UseMI; 2246 } 2247 2248 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, 2249 Register Reg, LLT Ty) const { 2250 Register LiveIn = MRI.getLiveInVirtReg(Reg); 2251 if (LiveIn) 2252 return LiveIn; 2253 2254 Register NewReg = MRI.createGenericVirtualRegister(Ty); 2255 MRI.addLiveIn(Reg, NewReg); 2256 return NewReg; 2257 } 2258 2259 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2260 const ArgDescriptor *Arg) const { 2261 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2262 return false; // TODO: Handle these 2263 2264 assert(Arg->getRegister().isPhysical()); 2265 2266 MachineRegisterInfo &MRI = *B.getMRI(); 2267 2268 LLT Ty = MRI.getType(DstReg); 2269 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); 2270 2271 if (Arg->isMasked()) { 2272 // TODO: Should we try to emit this once in the entry block? 2273 const LLT S32 = LLT::scalar(32); 2274 const unsigned Mask = Arg->getMask(); 2275 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2276 2277 Register AndMaskSrc = LiveIn; 2278 2279 if (Shift != 0) { 2280 auto ShiftAmt = B.buildConstant(S32, Shift); 2281 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2282 } 2283 2284 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2285 } else 2286 B.buildCopy(DstReg, LiveIn); 2287 2288 // Insert the argument copy if it doens't already exist. 2289 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
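  // Lazily emit the copy from the physical argument register in the entry
  // block the first time this argument is used, so later uses share the same
  // virtual register.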
2290 if (!MRI.getVRegDef(LiveIn)) { 2291 // FIXME: Should have scoped insert pt 2292 MachineBasicBlock &OrigInsBB = B.getMBB(); 2293 auto OrigInsPt = B.getInsertPt(); 2294 2295 MachineBasicBlock &EntryMBB = B.getMF().front(); 2296 EntryMBB.addLiveIn(Arg->getRegister()); 2297 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2298 B.buildCopy(LiveIn, Arg->getRegister()); 2299 2300 B.setInsertPt(OrigInsBB, OrigInsPt); 2301 } 2302 2303 return true; 2304 } 2305 2306 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2307 MachineInstr &MI, 2308 MachineRegisterInfo &MRI, 2309 MachineIRBuilder &B, 2310 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2311 B.setInstr(MI); 2312 2313 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2314 2315 const ArgDescriptor *Arg; 2316 const TargetRegisterClass *RC; 2317 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2318 if (!Arg) { 2319 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2320 return false; 2321 } 2322 2323 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 2324 MI.eraseFromParent(); 2325 return true; 2326 } 2327 2328 return false; 2329 } 2330 2331 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2332 MachineRegisterInfo &MRI, 2333 MachineIRBuilder &B) const { 2334 B.setInstr(MI); 2335 Register Dst = MI.getOperand(0).getReg(); 2336 LLT DstTy = MRI.getType(Dst); 2337 LLT S16 = LLT::scalar(16); 2338 LLT S32 = LLT::scalar(32); 2339 LLT S64 = LLT::scalar(64); 2340 2341 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2342 return true; 2343 2344 if (DstTy == S16) 2345 return legalizeFDIV16(MI, MRI, B); 2346 if (DstTy == S32) 2347 return legalizeFDIV32(MI, MRI, B); 2348 if (DstTy == S64) 2349 return legalizeFDIV64(MI, MRI, B); 2350 2351 return false; 2352 } 2353 2354 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2355 const LLT S32 = LLT::scalar(32); 2356 2357 auto Cvt0 = B.buildUITOFP(S32, Src); 2358 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2359 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2360 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2361 return B.buildFPTOUI(S32, Mul).getReg(0); 2362 } 2363 2364 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2365 Register DstReg, 2366 Register Num, 2367 Register Den, 2368 bool IsRem) const { 2369 const LLT S1 = LLT::scalar(1); 2370 const LLT S32 = LLT::scalar(32); 2371 2372 // RCP = URECIP(Den) = 2^32 / Den + e 2373 // e is rounding error. 2374 auto RCP = buildDivRCP(B, Den); 2375 2376 // RCP_LO = mul(RCP, Den) 2377 auto RCP_LO = B.buildMul(S32, RCP, Den); 2378 2379 // RCP_HI = mulhu (RCP, Den) */ 2380 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2381 2382 // NEG_RCP_LO = -RCP_LO 2383 auto Zero = B.buildConstant(S32, 0); 2384 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2385 2386 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2387 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2388 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2389 2390 // Calculate the rounding error from the URECIP instruction 2391 // E = mulhu(ABS_RCP_LO, RCP) 2392 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2393 2394 // RCP_A_E = RCP + E 2395 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2396 2397 // RCP_S_E = RCP - E 2398 auto RCP_S_E = B.buildSub(S32, RCP, E); 2399 2400 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_S_E)
2401   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2402
2403   // Quotient = mulhu(Tmp0, Num)
2404   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2405
2406   // Num_S_Remainder = Quotient * Den
2407   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2408
2409   // Remainder = Num - Num_S_Remainder
2410   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2411
2412   // Remainder_GE_Den = Remainder >= Den
2413   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2414
2415   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2416   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2417                                        Num, Num_S_Remainder);
2418
2419   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2420   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2421
2422   // Calculate Division result:
2423
2424   // Quotient_A_One = Quotient + 1
2425   auto One = B.buildConstant(S32, 1);
2426   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2427
2428   // Quotient_S_One = Quotient - 1
2429   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2430
2431   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2432   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2433
2434   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2435   if (IsRem) {
2436     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2437
2438     // Calculate Rem result:
2439     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2440
2441     // Remainder_A_Den = Remainder + Den
2442     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2443
2444     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2445     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2446
2447     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2448     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2449   } else {
2450     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2451   }
2452 }
2453
2454 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2455                                               MachineRegisterInfo &MRI,
2456                                               MachineIRBuilder &B) const {
2457   B.setInstr(MI);
2458   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2459   Register DstReg = MI.getOperand(0).getReg();
2460   Register Num = MI.getOperand(1).getReg();
2461   Register Den = MI.getOperand(2).getReg();
2462   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2463   MI.eraseFromParent();
2464   return true;
2465 }
2466
2467 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2468                                             MachineRegisterInfo &MRI,
2469                                             MachineIRBuilder &B) const {
2470   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2471     return legalizeUDIV_UREM32(MI, MRI, B);
2472   return false;
2473 }
2474
2475 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2476                                               MachineRegisterInfo &MRI,
2477                                               MachineIRBuilder &B) const {
2478   B.setInstr(MI);
2479   const LLT S32 = LLT::scalar(32);
2480
2481   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2482   Register DstReg = MI.getOperand(0).getReg();
2483   Register LHS = MI.getOperand(1).getReg();
2484   Register RHS = MI.getOperand(2).getReg();
2485
2486   auto ThirtyOne = B.buildConstant(S32, 31);
2487   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2488   auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2489
2490   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2491   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2492
2493   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2494   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2495
2496   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2497   legalizeUDIV_UREM32Impl(B,
UDivRem, LHS, RHS, IsRem); 2498 2499 if (IsRem) { 2500 auto RSign = LHSign; // Remainder sign is the same as LHS 2501 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0); 2502 B.buildSub(DstReg, UDivRem, RSign); 2503 } else { 2504 auto DSign = B.buildXor(S32, LHSign, RHSign); 2505 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0); 2506 B.buildSub(DstReg, UDivRem, DSign); 2507 } 2508 2509 MI.eraseFromParent(); 2510 return true; 2511 } 2512 2513 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2514 MachineRegisterInfo &MRI, 2515 MachineIRBuilder &B) const { 2516 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2517 return legalizeSDIV_SREM32(MI, MRI, B); 2518 return false; 2519 } 2520 2521 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2522 MachineRegisterInfo &MRI, 2523 MachineIRBuilder &B) const { 2524 Register Res = MI.getOperand(0).getReg(); 2525 Register LHS = MI.getOperand(1).getReg(); 2526 Register RHS = MI.getOperand(2).getReg(); 2527 2528 uint16_t Flags = MI.getFlags(); 2529 2530 LLT ResTy = MRI.getType(Res); 2531 LLT S32 = LLT::scalar(32); 2532 LLT S64 = LLT::scalar(64); 2533 2534 const MachineFunction &MF = B.getMF(); 2535 bool Unsafe = 2536 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2537 2538 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2539 return false; 2540 2541 if (!Unsafe && ResTy == S32 && 2542 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2543 return false; 2544 2545 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2546 // 1 / x -> RCP(x) 2547 if (CLHS->isExactlyValue(1.0)) { 2548 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2549 .addUse(RHS) 2550 .setMIFlags(Flags); 2551 2552 MI.eraseFromParent(); 2553 return true; 2554 } 2555 2556 // -1 / x -> RCP( FNEG(x) ) 2557 if (CLHS->isExactlyValue(-1.0)) { 2558 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2559 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2560 .addUse(FNeg.getReg(0)) 2561 .setMIFlags(Flags); 2562 2563 MI.eraseFromParent(); 2564 return true; 2565 } 2566 } 2567 2568 // x / y -> x * (1.0 / y) 2569 if (Unsafe) { 2570 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2571 .addUse(RHS) 2572 .setMIFlags(Flags); 2573 B.buildFMul(Res, LHS, RCP, Flags); 2574 2575 MI.eraseFromParent(); 2576 return true; 2577 } 2578 2579 return false; 2580 } 2581 2582 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2583 MachineRegisterInfo &MRI, 2584 MachineIRBuilder &B) const { 2585 B.setInstr(MI); 2586 Register Res = MI.getOperand(0).getReg(); 2587 Register LHS = MI.getOperand(1).getReg(); 2588 Register RHS = MI.getOperand(2).getReg(); 2589 2590 uint16_t Flags = MI.getFlags(); 2591 2592 LLT S16 = LLT::scalar(16); 2593 LLT S32 = LLT::scalar(32); 2594 2595 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2596 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2597 2598 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2599 .addUse(RHSExt.getReg(0)) 2600 .setMIFlags(Flags); 2601 2602 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2603 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2604 2605 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2606 .addUse(RDst.getReg(0)) 2607 .addUse(RHS) 2608 .addUse(LHS) 2609 .setMIFlags(Flags); 2610 2611 MI.eraseFromParent(); 2612 return true; 2613 } 2614 2615 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2616 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 
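// Subtargets with S_DENORM_MODE can update the FP32 field with a single
// immediate (while preserving the current FP64/FP16 field); older subtargets
// write the FP32 denorm bits of the MODE register with S_SETREG.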
2617 static void toggleSPDenormMode(bool Enable, 2618 MachineIRBuilder &B, 2619 const GCNSubtarget &ST, 2620 AMDGPU::SIModeRegisterDefaults Mode) { 2621 // Set SP denorm mode to this value. 2622 unsigned SPDenormMode = 2623 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2624 2625 if (ST.hasDenormModeInst()) { 2626 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2627 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2628 2629 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2630 B.buildInstr(AMDGPU::S_DENORM_MODE) 2631 .addImm(NewDenormModeValue); 2632 2633 } else { 2634 // Select FP32 bit field in mode register. 2635 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2636 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2637 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2638 2639 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2640 .addImm(SPDenormMode) 2641 .addImm(SPDenormModeBitField); 2642 } 2643 } 2644 2645 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2646 MachineRegisterInfo &MRI, 2647 MachineIRBuilder &B) const { 2648 B.setInstr(MI); 2649 Register Res = MI.getOperand(0).getReg(); 2650 Register LHS = MI.getOperand(1).getReg(); 2651 Register RHS = MI.getOperand(2).getReg(); 2652 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2653 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2654 2655 uint16_t Flags = MI.getFlags(); 2656 2657 LLT S32 = LLT::scalar(32); 2658 LLT S1 = LLT::scalar(1); 2659 2660 auto One = B.buildFConstant(S32, 1.0f); 2661 2662 auto DenominatorScaled = 2663 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2664 .addUse(RHS) 2665 .addUse(LHS) 2666 .addImm(1) 2667 .setMIFlags(Flags); 2668 auto NumeratorScaled = 2669 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2670 .addUse(LHS) 2671 .addUse(RHS) 2672 .addImm(0) 2673 .setMIFlags(Flags); 2674 2675 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2676 .addUse(DenominatorScaled.getReg(0)) 2677 .setMIFlags(Flags); 2678 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2679 2680 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2681 // aren't modeled as reading it. 
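  // Denormals are temporarily enabled around the reciprocal refinement below
  // so the intermediate FMA results are not flushed, then restored before
  // div_fmas and div_fixup produce the final result.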
2682   if (!Mode.allFP32Denormals())
2683     toggleSPDenormMode(true, B, ST, Mode);
2684
2685   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2686   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2687   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2688   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2689   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2690   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2691
2692   if (!Mode.allFP32Denormals())
2693     toggleSPDenormMode(false, B, ST, Mode);
2694
2695   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2696     .addUse(Fma4.getReg(0))
2697     .addUse(Fma1.getReg(0))
2698     .addUse(Fma3.getReg(0))
2699     .addUse(NumeratorScaled.getReg(1))
2700     .setMIFlags(Flags);
2701
2702   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2703     .addUse(Fmas.getReg(0))
2704     .addUse(RHS)
2705     .addUse(LHS)
2706     .setMIFlags(Flags);
2707
2708   MI.eraseFromParent();
2709   return true;
2710 }
2711
2712 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2713                                          MachineRegisterInfo &MRI,
2714                                          MachineIRBuilder &B) const {
2715   B.setInstr(MI);
2716   Register Res = MI.getOperand(0).getReg();
2717   Register LHS = MI.getOperand(1).getReg();
2718   Register RHS = MI.getOperand(2).getReg();
2719
2720   uint16_t Flags = MI.getFlags();
2721
2722   LLT S64 = LLT::scalar(64);
2723   LLT S1 = LLT::scalar(1);
2724
2725   auto One = B.buildFConstant(S64, 1.0);
2726
2727   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2728     .addUse(LHS)
2729     .addUse(RHS)
2730     .addImm(1)
2731     .setMIFlags(Flags);
2732
2733   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2734
2735   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2736     .addUse(DivScale0.getReg(0))
2737     .setMIFlags(Flags);
2738
2739   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2740   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2741   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2742
2743   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2744     .addUse(LHS)
2745     .addUse(RHS)
2746     .addImm(0)
2747     .setMIFlags(Flags);
2748
2749   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2750   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2751   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2752
2753   Register Scale;
2754   if (!ST.hasUsableDivScaleConditionOutput()) {
2755     // Workaround a hardware bug on SI where the condition output from div_scale
2756     // is not usable.
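    // Compare the high halves of the operands with the div_scale results to
    // recompute which operand was scaled, and use that to form the condition
    // input for div_fmas.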
2757 2758 LLT S32 = LLT::scalar(32); 2759 2760 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2761 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2762 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2763 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2764 2765 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2766 Scale1Unmerge.getReg(1)); 2767 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2768 Scale0Unmerge.getReg(1)); 2769 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2770 } else { 2771 Scale = DivScale1.getReg(1); 2772 } 2773 2774 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2775 .addUse(Fma4.getReg(0)) 2776 .addUse(Fma3.getReg(0)) 2777 .addUse(Mul.getReg(0)) 2778 .addUse(Scale) 2779 .setMIFlags(Flags); 2780 2781 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2782 .addUse(Fmas.getReg(0)) 2783 .addUse(RHS) 2784 .addUse(LHS) 2785 .setMIFlags(Flags); 2786 2787 MI.eraseFromParent(); 2788 return true; 2789 } 2790 2791 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2792 MachineRegisterInfo &MRI, 2793 MachineIRBuilder &B) const { 2794 B.setInstr(MI); 2795 Register Res = MI.getOperand(0).getReg(); 2796 Register LHS = MI.getOperand(2).getReg(); 2797 Register RHS = MI.getOperand(3).getReg(); 2798 uint16_t Flags = MI.getFlags(); 2799 2800 LLT S32 = LLT::scalar(32); 2801 LLT S1 = LLT::scalar(1); 2802 2803 auto Abs = B.buildFAbs(S32, RHS, Flags); 2804 const APFloat C0Val(1.0f); 2805 2806 auto C0 = B.buildConstant(S32, 0x6f800000); 2807 auto C1 = B.buildConstant(S32, 0x2f800000); 2808 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2809 2810 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2811 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2812 2813 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2814 2815 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2816 .addUse(Mul0.getReg(0)) 2817 .setMIFlags(Flags); 2818 2819 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2820 2821 B.buildFMul(Res, Sel, Mul1, Flags); 2822 2823 MI.eraseFromParent(); 2824 return true; 2825 } 2826 2827 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2828 MachineRegisterInfo &MRI, 2829 MachineIRBuilder &B) const { 2830 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2831 if (!MFI->isEntryFunction()) { 2832 return legalizePreloadedArgIntrin(MI, MRI, B, 2833 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2834 } 2835 2836 B.setInstr(MI); 2837 2838 uint64_t Offset = 2839 ST.getTargetLowering()->getImplicitParameterOffset( 2840 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2841 Register DstReg = MI.getOperand(0).getReg(); 2842 LLT DstTy = MRI.getType(DstReg); 2843 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2844 2845 const ArgDescriptor *Arg; 2846 const TargetRegisterClass *RC; 2847 std::tie(Arg, RC) 2848 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2849 if (!Arg) 2850 return false; 2851 2852 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2853 if (!loadInputValue(KernargPtrReg, B, Arg)) 2854 return false; 2855 2856 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2857 MI.eraseFromParent(); 2858 return true; 2859 } 2860 2861 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2862 MachineRegisterInfo &MRI, 2863 MachineIRBuilder &B, 2864 unsigned AddrSpace) const { 2865 B.setInstr(MI); 2866 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2867 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2868 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2869 MI.eraseFromParent(); 2870 return true; 2871 } 2872 2873 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2874 // offset (the offset that is included in bounds checking and swizzling, to be 2875 // split between the instruction's voffset and immoffset fields) and soffset 2876 // (the offset that is excluded from bounds checking and swizzling, to go in 2877 // the instruction's soffset field). This function takes the first kind of 2878 // offset and figures out how to split it between voffset and immoffset. 2879 std::tuple<Register, unsigned, unsigned> 2880 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2881 Register OrigOffset) const { 2882 const unsigned MaxImm = 4095; 2883 Register BaseReg; 2884 unsigned TotalConstOffset; 2885 MachineInstr *OffsetDef; 2886 const LLT S32 = LLT::scalar(32); 2887 2888 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2889 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2890 2891 unsigned ImmOffset = TotalConstOffset; 2892 2893 // If the immediate value is too big for the immoffset field, put the value 2894 // and -4096 into the immoffset field so that the value that is copied/added 2895 // for the voffset field is a multiple of 4096, and it stands more chance 2896 // of being CSEd with the copy/add for another similar load/store. 2897 // However, do not do that rounding down to a multiple of 4096 if that is a 2898 // negative number, as it appears to be illegal to have a negative offset 2899 // in the vgpr, even if adding the immediate offset makes it positive. 2900 unsigned Overflow = ImmOffset & ~MaxImm; 2901 ImmOffset -= Overflow; 2902 if ((int32_t)Overflow < 0) { 2903 Overflow += ImmOffset; 2904 ImmOffset = 0; 2905 } 2906 2907 if (Overflow != 0) { 2908 if (!BaseReg) { 2909 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2910 } else { 2911 auto OverflowVal = B.buildConstant(S32, Overflow); 2912 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2913 } 2914 } 2915 2916 if (!BaseReg) 2917 BaseReg = B.buildConstant(S32, 0).getReg(0); 2918 2919 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2920 } 2921 2922 /// Handle register layout difference for f16 images for some subtargets. 2923 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2924 MachineRegisterInfo &MRI, 2925 Register Reg) const { 2926 if (!ST.hasUnpackedD16VMem()) 2927 return Reg; 2928 2929 const LLT S16 = LLT::scalar(16); 2930 const LLT S32 = LLT::scalar(32); 2931 LLT StoreVT = MRI.getType(Reg); 2932 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2933 2934 auto Unmerge = B.buildUnmerge(S16, Reg); 2935 2936 SmallVector<Register, 4> WideRegs; 2937 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2938 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2939 2940 int NumElts = StoreVT.getNumElements(); 2941 2942 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2943 } 2944 2945 Register AMDGPULegalizerInfo::fixStoreSourceType( 2946 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2947 MachineRegisterInfo *MRI = B.getMRI(); 2948 LLT Ty = MRI->getType(VData); 2949 2950 const LLT S16 = LLT::scalar(16); 2951 2952 // Fixup illegal register types for i8 stores. 
2953 if (Ty == LLT::scalar(8) || Ty == S16) { 2954 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2955 return AnyExt; 2956 } 2957 2958 if (Ty.isVector()) { 2959 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2960 if (IsFormat) 2961 return handleD16VData(B, *MRI, VData); 2962 } 2963 } 2964 2965 return VData; 2966 } 2967 2968 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2969 MachineRegisterInfo &MRI, 2970 MachineIRBuilder &B, 2971 bool IsTyped, 2972 bool IsFormat) const { 2973 B.setInstr(MI); 2974 2975 Register VData = MI.getOperand(1).getReg(); 2976 LLT Ty = MRI.getType(VData); 2977 LLT EltTy = Ty.getScalarType(); 2978 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2979 const LLT S32 = LLT::scalar(32); 2980 2981 VData = fixStoreSourceType(B, VData, IsFormat); 2982 Register RSrc = MI.getOperand(2).getReg(); 2983 2984 MachineMemOperand *MMO = *MI.memoperands_begin(); 2985 const int MemSize = MMO->getSize(); 2986 2987 unsigned ImmOffset; 2988 unsigned TotalOffset; 2989 2990 // The typed intrinsics add an immediate after the registers. 2991 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2992 2993 // The struct intrinsic variants add one additional operand over raw. 2994 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2995 Register VIndex; 2996 int OpOffset = 0; 2997 if (HasVIndex) { 2998 VIndex = MI.getOperand(3).getReg(); 2999 OpOffset = 1; 3000 } 3001 3002 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3003 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3004 3005 unsigned Format = 0; 3006 if (IsTyped) { 3007 Format = MI.getOperand(5 + OpOffset).getImm(); 3008 ++OpOffset; 3009 } 3010 3011 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3012 3013 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3014 if (TotalOffset != 0) 3015 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3016 3017 unsigned Opc; 3018 if (IsTyped) { 3019 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3020 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3021 } else if (IsFormat) { 3022 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3023 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3024 } else { 3025 switch (MemSize) { 3026 case 1: 3027 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3028 break; 3029 case 2: 3030 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3031 break; 3032 default: 3033 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3034 break; 3035 } 3036 } 3037 3038 if (!VIndex) 3039 VIndex = B.buildConstant(S32, 0).getReg(0); 3040 3041 auto MIB = B.buildInstr(Opc) 3042 .addUse(VData) // vdata 3043 .addUse(RSrc) // rsrc 3044 .addUse(VIndex) // vindex 3045 .addUse(VOffset) // voffset 3046 .addUse(SOffset) // soffset 3047 .addImm(ImmOffset); // offset(imm) 3048 3049 if (IsTyped) 3050 MIB.addImm(Format); 3051 3052 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3053 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3054 .addMemOperand(MMO); 3055 3056 MI.eraseFromParent(); 3057 return true; 3058 } 3059 3060 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3061 MachineRegisterInfo &MRI, 3062 MachineIRBuilder &B, 3063 bool IsFormat, 3064 bool IsTyped) const { 3065 B.setInstr(MI); 3066 3067 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
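  // The single memory operand describes the access; its size selects between
  // the byte, short and dword buffer load opcodes below.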
3068 MachineMemOperand *MMO = *MI.memoperands_begin(); 3069 const int MemSize = MMO->getSize(); 3070 const LLT S32 = LLT::scalar(32); 3071 3072 Register Dst = MI.getOperand(0).getReg(); 3073 Register RSrc = MI.getOperand(2).getReg(); 3074 3075 // The typed intrinsics add an immediate after the registers. 3076 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3077 3078 // The struct intrinsic variants add one additional operand over raw. 3079 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3080 Register VIndex; 3081 int OpOffset = 0; 3082 if (HasVIndex) { 3083 VIndex = MI.getOperand(3).getReg(); 3084 OpOffset = 1; 3085 } 3086 3087 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3088 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3089 3090 unsigned Format = 0; 3091 if (IsTyped) { 3092 Format = MI.getOperand(5 + OpOffset).getImm(); 3093 ++OpOffset; 3094 } 3095 3096 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3097 unsigned ImmOffset; 3098 unsigned TotalOffset; 3099 3100 LLT Ty = MRI.getType(Dst); 3101 LLT EltTy = Ty.getScalarType(); 3102 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3103 const bool Unpacked = ST.hasUnpackedD16VMem(); 3104 3105 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3106 if (TotalOffset != 0) 3107 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3108 3109 unsigned Opc; 3110 3111 if (IsTyped) { 3112 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3113 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3114 } else if (IsFormat) { 3115 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3116 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3117 } else { 3118 switch (MemSize) { 3119 case 1: 3120 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3121 break; 3122 case 2: 3123 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3124 break; 3125 default: 3126 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3127 break; 3128 } 3129 } 3130 3131 Register LoadDstReg; 3132 3133 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3134 LLT UnpackedTy = Ty.changeElementSize(32); 3135 3136 if (IsExtLoad) 3137 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3138 else if (Unpacked && IsD16 && Ty.isVector()) 3139 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3140 else 3141 LoadDstReg = Dst; 3142 3143 if (!VIndex) 3144 VIndex = B.buildConstant(S32, 0).getReg(0); 3145 3146 auto MIB = B.buildInstr(Opc) 3147 .addDef(LoadDstReg) // vdata 3148 .addUse(RSrc) // rsrc 3149 .addUse(VIndex) // vindex 3150 .addUse(VOffset) // voffset 3151 .addUse(SOffset) // soffset 3152 .addImm(ImmOffset); // offset(imm) 3153 3154 if (IsTyped) 3155 MIB.addImm(Format); 3156 3157 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3158 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3159 .addMemOperand(MMO); 3160 3161 if (LoadDstReg != Dst) { 3162 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3163 3164 // Widen result for extending loads was widened. 
3165 if (IsExtLoad) 3166 B.buildTrunc(Dst, LoadDstReg); 3167 else { 3168 // Repack to original 16-bit vector result 3169 // FIXME: G_TRUNC should work, but legalization currently fails 3170 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3171 SmallVector<Register, 4> Repack; 3172 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3173 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3174 B.buildMerge(Dst, Repack); 3175 } 3176 } 3177 3178 MI.eraseFromParent(); 3179 return true; 3180 } 3181 3182 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3183 MachineIRBuilder &B, 3184 bool IsInc) const { 3185 B.setInstr(MI); 3186 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3187 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3188 B.buildInstr(Opc) 3189 .addDef(MI.getOperand(0).getReg()) 3190 .addUse(MI.getOperand(2).getReg()) 3191 .addUse(MI.getOperand(3).getReg()) 3192 .cloneMemRefs(MI); 3193 MI.eraseFromParent(); 3194 return true; 3195 } 3196 3197 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3198 switch (IntrID) { 3199 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3200 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3201 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3202 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3203 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3204 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3205 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3206 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3207 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3208 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3209 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3210 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3211 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3212 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3213 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3214 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3215 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3216 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3217 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3218 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3219 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3220 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3221 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3222 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3223 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3224 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3225 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3226 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3227 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3228 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3229 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3230 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3231 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3232 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3233 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3234 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3235 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3236 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3237 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3238 default: 3239 llvm_unreachable("unhandled atomic opcode"); 3240 } 3241 } 3242 3243 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3244 MachineIRBuilder &B, 3245 Intrinsic::ID IID) const { 3246 B.setInstr(MI); 3247 3248 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3249 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3250 3251 Register Dst = MI.getOperand(0).getReg(); 3252 Register VData = 
MI.getOperand(2).getReg(); 3253 3254 Register CmpVal; 3255 int OpOffset = 0; 3256 3257 if (IsCmpSwap) { 3258 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3259 ++OpOffset; 3260 } 3261 3262 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3263 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3264 3265 // The struct intrinsic variants add one additional operand over raw. 3266 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3267 Register VIndex; 3268 if (HasVIndex) { 3269 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3270 ++OpOffset; 3271 } 3272 3273 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3274 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3275 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3276 3277 MachineMemOperand *MMO = *MI.memoperands_begin(); 3278 3279 unsigned ImmOffset; 3280 unsigned TotalOffset; 3281 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3282 if (TotalOffset != 0) 3283 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3284 3285 if (!VIndex) 3286 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3287 3288 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3289 .addDef(Dst) 3290 .addUse(VData); // vdata 3291 3292 if (IsCmpSwap) 3293 MIB.addReg(CmpVal); 3294 3295 MIB.addUse(RSrc) // rsrc 3296 .addUse(VIndex) // vindex 3297 .addUse(VOffset) // voffset 3298 .addUse(SOffset) // soffset 3299 .addImm(ImmOffset) // offset(imm) 3300 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3301 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3302 .addMemOperand(MMO); 3303 3304 MI.eraseFromParent(); 3305 return true; 3306 } 3307 3308 // Produce a vector of s16 elements from s32 pieces. 3309 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg, 3310 ArrayRef<Register> UnmergeParts) { 3311 const LLT S16 = LLT::scalar(16); 3312 3313 SmallVector<Register, 4> RemergeParts(UnmergeParts.size()); 3314 for (int I = 0, E = UnmergeParts.size(); I != E; ++I) 3315 RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0); 3316 3317 B.buildBuildVector(DstReg, RemergeParts); 3318 } 3319 3320 /// Convert a set of s32 registers to a result vector with s16 elements. 3321 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg, 3322 ArrayRef<Register> UnmergeParts) { 3323 MachineRegisterInfo &MRI = *B.getMRI(); 3324 const LLT V2S16 = LLT::vector(2, 16); 3325 LLT TargetTy = MRI.getType(DstReg); 3326 int NumElts = UnmergeParts.size(); 3327 3328 if (NumElts == 1) { 3329 assert(TargetTy == V2S16); 3330 B.buildBitcast(DstReg, UnmergeParts[0]); 3331 return; 3332 } 3333 3334 SmallVector<Register, 4> RemergeParts(NumElts); 3335 for (int I = 0; I != NumElts; ++I) 3336 RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0); 3337 3338 if (TargetTy.getSizeInBits() == 32u * NumElts) { 3339 B.buildConcatVectors(DstReg, RemergeParts); 3340 return; 3341 } 3342 3343 const LLT V3S16 = LLT::vector(3, 16); 3344 const LLT V6S16 = LLT::vector(6, 16); 3345 3346 // Widen to v6s16 and unpack v3 parts. 3347 assert(TargetTy == V3S16); 3348 3349 RemergeParts.push_back(B.buildUndef(V2S16).getReg(0)); 3350 auto Concat = B.buildConcatVectors(V6S16, RemergeParts); 3351 B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat); 3352 } 3353 3354 // FIXME: Just vector trunc should be sufficent, but legalization currently 3355 // broken. 
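// Unmerge the wide s32 result, truncate each piece to s16 and rebuild the
// original vector type.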
3356 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg, 3357 Register WideDstReg) { 3358 const LLT S32 = LLT::scalar(32); 3359 const LLT S16 = LLT::scalar(16); 3360 3361 auto Unmerge = B.buildUnmerge(S32, WideDstReg); 3362 3363 int NumOps = Unmerge->getNumOperands() - 1; 3364 SmallVector<Register, 4> RemergeParts(NumOps); 3365 for (int I = 0; I != NumOps; ++I) 3366 RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0); 3367 3368 B.buildBuildVector(DstReg, RemergeParts); 3369 } 3370 3371 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3372 MachineInstr &MI, MachineIRBuilder &B, 3373 GISelChangeObserver &Observer, 3374 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3375 bool IsTFE = MI.getNumExplicitDefs() == 2; 3376 3377 // We are only processing the operands of d16 image operations on subtargets 3378 // that use the unpacked register layout, or need to repack the TFE result. 3379 3380 // TODO: Need to handle a16 images too 3381 // TODO: Do we need to guard against already legalized intrinsics? 3382 if (!IsTFE && !ST.hasUnpackedD16VMem()) 3383 return true; 3384 3385 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3386 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3387 3388 if (BaseOpcode->Atomic) // No d16 atomics, or TFE. 3389 return true; 3390 3391 B.setInstr(MI); 3392 3393 MachineRegisterInfo *MRI = B.getMRI(); 3394 const LLT S32 = LLT::scalar(32); 3395 const LLT S16 = LLT::scalar(16); 3396 3397 if (BaseOpcode->Store) { // No TFE for stores? 3398 Register VData = MI.getOperand(1).getReg(); 3399 LLT Ty = MRI->getType(VData); 3400 if (!Ty.isVector() || Ty.getElementType() != S16) 3401 return true; 3402 3403 B.setInstr(MI); 3404 3405 Observer.changingInstr(MI); 3406 MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData)); 3407 Observer.changedInstr(MI); 3408 return true; 3409 } 3410 3411 Register DstReg = MI.getOperand(0).getReg(); 3412 LLT Ty = MRI->getType(DstReg); 3413 const LLT EltTy = Ty.getScalarType(); 3414 const bool IsD16 = Ty.getScalarType() == S16; 3415 const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3416 3417 if (IsTFE) { 3418 // In the IR, TFE is supposed to be used with a 2 element struct return 3419 // type. The intruction really returns these two values in one contiguous 3420 // register, with one additional dword beyond the loaded data. Rewrite the 3421 // return type to use a single register result. 3422 Register Dst1Reg = MI.getOperand(1).getReg(); 3423 if (MRI->getType(Dst1Reg) != S32) 3424 return false; 3425 3426 // TODO: Make sure the TFE operand bit is set. 3427 3428 // The raw dword aligned data component of the load. The only legal cases 3429 // where this matters should be when using the packed D16 format, for 3430 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3431 LLT RoundedTy; 3432 LLT TFETy; 3433 3434 if (IsD16 && ST.hasUnpackedD16VMem()) { 3435 RoundedTy = LLT::scalarOrVector(NumElts, 32); 3436 TFETy = LLT::vector(NumElts + 1, 32); 3437 } else { 3438 unsigned EltSize = Ty.getScalarSizeInBits(); 3439 unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32; 3440 unsigned RoundedSize = 32 * RoundedElts; 3441 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3442 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3443 } 3444 3445 Register TFEReg = MRI->createGenericVirtualRegister(TFETy); 3446 Observer.changingInstr(MI); 3447 3448 MI.getOperand(0).setReg(TFEReg); 3449 MI.RemoveOperand(1); 3450 3451 Observer.changedInstr(MI); 3452 3453 // Insert after the instruction. 
    B.setInsertPt(*MI.getParent(), ++MI.getIterator());

    // Now figure out how to copy the new result register back into the old
    // result.

    SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
    int NumDataElts = TFETy.getNumElements() - 1;

    if (!Ty.isVector()) {
      // Simplest case is a trivial unmerge (plus a truncate for d16).
      UnmergeResults[0] = Ty == S32 ?
        DstReg : MRI->createGenericVirtualRegister(S32);

      B.buildUnmerge(UnmergeResults, TFEReg);
      if (Ty != S32)
        B.buildTrunc(DstReg, UnmergeResults[0]);
      return true;
    }

    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataElts; ++I)
      UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
    B.buildUnmerge(UnmergeResults, TFEReg);

    // Drop the final TFE element.
    ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);

    if (EltTy == S32)
      B.buildBuildVector(DstReg, DataPart);
    else if (ST.hasUnpackedD16VMem())
      truncToS16Vector(B, DstReg, DataPart);
    else
      bitcastToS16Vector(B, DstReg, DataPart);

    return true;
  }

  // Must be an image load.
  if (!Ty.isVector() || Ty.getElementType() != S16)
    return true;

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  LLT WidenedTy = Ty.changeElementType(S32);
  Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);

  Observer.changingInstr(MI);
  MI.getOperand(0).setReg(WideDstReg);
  Observer.changedInstr(MI);

  repackUnpackedD16Load(B, DstReg, WideDstReg);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const unsigned MemAlign = 4;
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
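  // Round a non-power-of-2 result width (e.g. 96 bits) up to the next power
  // of 2, widening either the vector element count or the scalar size.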
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);
    B.setInstr(MI);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrTarget);

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}