1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #if defined(_MSC_VER) || defined(__MINGW32__) 15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI 16 // from the Visual C++ cmath / math.h headers: 17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 18 #define _USE_MATH_DEFINES 19 #endif 20 21 #include "AMDGPULegalizerInfo.h" 22 23 #include "AMDGPU.h" 24 #include "AMDGPUGlobalISelUtils.h" 25 #include "AMDGPUTargetMachine.h" 26 #include "SIMachineFunctionInfo.h" 27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 30 #include "llvm/CodeGen/TargetOpcodes.h" 31 #include "llvm/CodeGen/ValueTypes.h" 32 #include "llvm/IR/DerivedTypes.h" 33 #include "llvm/IR/DiagnosticInfo.h" 34 #include "llvm/IR/Type.h" 35 #include "llvm/Support/Debug.h" 36 37 #define DEBUG_TYPE "amdgpu-legalinfo" 38 39 using namespace llvm; 40 using namespace LegalizeActions; 41 using namespace LegalizeMutations; 42 using namespace LegalityPredicates; 43 using namespace MIPatternMatch; 44 45 // Round the number of elements to the next power of two elements 46 static LLT getPow2VectorType(LLT Ty) { 47 unsigned NElts = Ty.getNumElements(); 48 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 49 return Ty.changeNumElements(Pow2NElts); 50 } 51 52 // Round the number of bits to the next power of two bits 53 static LLT getPow2ScalarType(LLT Ty) { 54 unsigned Bits = Ty.getSizeInBits(); 55 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 56 return LLT::scalar(Pow2Bits); 57 } 58 59 static LegalityPredicate isMultiple32(unsigned TypeIdx, 60 unsigned MaxSize = 1024) { 61 return [=](const LegalityQuery &Query) { 62 const LLT Ty = Query.Types[TypeIdx]; 63 const LLT EltTy = Ty.getScalarType(); 64 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; 65 }; 66 } 67 68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { 69 return [=](const LegalityQuery &Query) { 70 return Query.Types[TypeIdx].getSizeInBits() == Size; 71 }; 72 } 73 74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 75 return [=](const LegalityQuery &Query) { 76 const LLT Ty = Query.Types[TypeIdx]; 77 return Ty.isVector() && 78 Ty.getNumElements() % 2 != 0 && 79 Ty.getElementType().getSizeInBits() < 32 && 80 Ty.getSizeInBits() % 32 != 0; 81 }; 82 } 83 84 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 85 return [=](const LegalityQuery &Query) { 86 const LLT Ty = Query.Types[TypeIdx]; 87 const LLT EltTy = Ty.getScalarType(); 88 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 89 }; 90 } 91 92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 93 return [=](const LegalityQuery &Query) { 94 const LLT Ty = Query.Types[TypeIdx]; 95 const LLT EltTy = Ty.getElementType(); 96 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 97 }; 98 } 99 100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 101 return [=](const LegalityQuery &Query) { 102 const LLT Ty = Query.Types[TypeIdx]; 103 const LLT EltTy = Ty.getElementType(); 104 unsigned Size = Ty.getSizeInBits(); 105 unsigned Pieces = (Size + 63) / 64; 106 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 107 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 108 }; 109 } 110 111 // Increase the number of vector elements to reach the next multiple of 32-bit 112 // type. 113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 114 return [=](const LegalityQuery &Query) { 115 const LLT Ty = Query.Types[TypeIdx]; 116 117 const LLT EltTy = Ty.getElementType(); 118 const int Size = Ty.getSizeInBits(); 119 const int EltSize = EltTy.getSizeInBits(); 120 const int NextMul32 = (Size + 31) / 32; 121 122 assert(EltSize < 32); 123 124 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 125 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 126 }; 127 } 128 129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 130 return [=](const LegalityQuery &Query) { 131 const LLT QueryTy = Query.Types[TypeIdx]; 132 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 133 }; 134 } 135 136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 137 return [=](const LegalityQuery &Query) { 138 const LLT QueryTy = Query.Types[TypeIdx]; 139 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 140 }; 141 } 142 143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 144 return [=](const LegalityQuery &Query) { 145 const LLT QueryTy = Query.Types[TypeIdx]; 146 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 147 }; 148 } 149 150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 151 // v2s16. 152 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 153 return [=](const LegalityQuery &Query) { 154 const LLT Ty = Query.Types[TypeIdx]; 155 if (Ty.isVector()) { 156 const int EltSize = Ty.getElementType().getSizeInBits(); 157 return EltSize == 32 || EltSize == 64 || 158 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 159 EltSize == 128 || EltSize == 256; 160 } 161 162 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 163 }; 164 } 165 166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 167 return [=](const LegalityQuery &Query) { 168 const LLT QueryTy = Query.Types[TypeIdx]; 169 return QueryTy.isVector() && QueryTy.getElementType() == Type; 170 }; 171 } 172 173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 174 return [=](const LegalityQuery &Query) { 175 const LLT Ty = Query.Types[TypeIdx]; 176 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 177 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 178 }; 179 } 180 181 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) { 182 return [=](const LegalityQuery &Query) { 183 return Query.Types[TypeIdx0].getSizeInBits() < 184 Query.Types[TypeIdx1].getSizeInBits(); 185 }; 186 } 187 188 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) { 189 return [=](const LegalityQuery &Query) { 190 return Query.Types[TypeIdx0].getSizeInBits() > 191 Query.Types[TypeIdx1].getSizeInBits(); 192 }; 193 } 194 195 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 196 const GCNTargetMachine &TM) 197 : ST(ST_) { 198 using namespace TargetOpcode; 199 200 auto GetAddrSpacePtr = [&TM](unsigned AS) { 201 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 202 }; 203 204 const LLT S1 = LLT::scalar(1); 205 const LLT S16 = LLT::scalar(16); 206 const LLT S32 = LLT::scalar(32); 207 const LLT S64 = LLT::scalar(64); 208 const LLT S128 = LLT::scalar(128); 209 const LLT S256 = LLT::scalar(256); 210 const LLT S1024 = LLT::scalar(1024); 211 212 const LLT V2S16 = LLT::vector(2, 16); 213 const LLT V4S16 = LLT::vector(4, 16); 214 215 const LLT V2S32 = LLT::vector(2, 32); 216 const LLT V3S32 = LLT::vector(3, 32); 217 const LLT V4S32 = LLT::vector(4, 32); 218 const LLT V5S32 = LLT::vector(5, 32); 219 const LLT V6S32 = LLT::vector(6, 32); 220 const LLT V7S32 = LLT::vector(7, 32); 221 const LLT V8S32 = LLT::vector(8, 32); 222 const LLT V9S32 = LLT::vector(9, 32); 223 const LLT V10S32 = LLT::vector(10, 32); 224 const LLT V11S32 = LLT::vector(11, 32); 225 const LLT V12S32 = LLT::vector(12, 32); 226 const LLT V13S32 = LLT::vector(13, 32); 227 const LLT V14S32 = LLT::vector(14, 32); 228 const LLT V15S32 = LLT::vector(15, 32); 229 const LLT V16S32 = LLT::vector(16, 32); 230 const LLT V32S32 = LLT::vector(32, 32); 231 232 const LLT V2S64 = LLT::vector(2, 64); 233 const LLT V3S64 = LLT::vector(3, 64); 234 const LLT V4S64 = LLT::vector(4, 64); 235 const LLT V5S64 = LLT::vector(5, 64); 236 const LLT V6S64 = LLT::vector(6, 64); 237 const LLT V7S64 = LLT::vector(7, 64); 238 const LLT V8S64 = LLT::vector(8, 64); 239 const LLT V16S64 = LLT::vector(16, 64); 240 241 std::initializer_list<LLT> AllS32Vectors = 242 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 243 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 244 std::initializer_list<LLT> AllS64Vectors = 245 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 246 247 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 248 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 249 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 250 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 251 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 252 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 253 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 254 255 const LLT CodePtr = FlatPtr; 256 257 const std::initializer_list<LLT> AddrSpaces64 = { 258 GlobalPtr, ConstantPtr, FlatPtr 259 }; 260 261 const std::initializer_list<LLT> AddrSpaces32 = { 262 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 263 }; 264 265 const std::initializer_list<LLT> FPTypesBase = { 266 S32, S64 267 }; 268 269 const std::initializer_list<LLT> FPTypes16 = { 270 S32, S64, S16 271 }; 272 273 const std::initializer_list<LLT> FPTypesPK16 = { 274 S32, S64, S16, V2S16 275 }; 276 277 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 278 279 setAction({G_BRCOND, S1}, Legal); // VCC branches 280 setAction({G_BRCOND, S32}, Legal); // SCC branches 281 282 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 283 // elements for v3s16 284 getActionDefinitionsBuilder(G_PHI) 285 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 286 .legalFor(AllS32Vectors) 287 .legalFor(AllS64Vectors) 288 .legalFor(AddrSpaces64) 289 .legalFor(AddrSpaces32) 290 .clampScalar(0, S32, S256) 291 .widenScalarToNextPow2(0, 32) 292 .clampMaxNumElements(0, S32, 16) 293 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 294 .legalIf(isPointer(0)); 295 296 if (ST.has16BitInsts()) { 297 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 298 .legalFor({S32, S16}) 299 .clampScalar(0, S16, S32) 300 .scalarize(0) 301 .widenScalarToNextPow2(0, 32); 302 } else { 303 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 304 .legalFor({S32}) 305 .clampScalar(0, S32, S32) 306 .scalarize(0); 307 } 308 309 // FIXME: Not really legal. Placeholder for custom lowering. 310 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 311 .customFor({S32, S64}) 312 .clampScalar(0, S32, S64) 313 .widenScalarToNextPow2(0, 32) 314 .scalarize(0); 315 316 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 317 .legalFor({S32}) 318 .clampScalar(0, S32, S32) 319 .scalarize(0); 320 321 // Report legal for any types we can handle anywhere. For the cases only legal 322 // on the SALU, RegBankSelect will be able to re-legalize. 323 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 324 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 325 .clampScalar(0, S32, S64) 326 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 327 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 328 .widenScalarToNextPow2(0) 329 .scalarize(0); 330 331 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 332 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 333 .legalFor({{S32, S1}, {S32, S32}}) 334 .clampScalar(0, S32, S32) 335 .scalarize(0); // TODO: Implement. 336 337 getActionDefinitionsBuilder(G_BITCAST) 338 // Don't worry about the size constraint. 339 .legalIf(all(isRegisterType(0), isRegisterType(1))) 340 .lower(); 341 342 343 getActionDefinitionsBuilder(G_CONSTANT) 344 .legalFor({S1, S32, S64, S16, GlobalPtr, 345 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 346 .clampScalar(0, S32, S64) 347 .widenScalarToNextPow2(0) 348 .legalIf(isPointer(0)); 349 350 getActionDefinitionsBuilder(G_FCONSTANT) 351 .legalFor({S32, S64, S16}) 352 .clampScalar(0, S16, S64); 353 354 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 355 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 356 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 357 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 358 .clampScalarOrElt(0, S32, S1024) 359 .legalIf(isMultiple32(0)) 360 .widenScalarToNextPow2(0, 32) 361 .clampMaxNumElements(0, S32, 16); 362 363 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 364 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 365 .unsupportedFor({PrivatePtr}) 366 .custom(); 367 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 368 369 auto &FPOpActions = getActionDefinitionsBuilder( 370 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 371 .legalFor({S32, S64}); 372 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 373 .customFor({S32, S64}); 374 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 375 .customFor({S32, S64}); 376 377 if (ST.has16BitInsts()) { 378 if (ST.hasVOP3PInsts()) 379 FPOpActions.legalFor({S16, V2S16}); 380 else 381 FPOpActions.legalFor({S16}); 382 383 TrigActions.customFor({S16}); 384 FDIVActions.customFor({S16}); 385 } 386 387 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 388 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 389 390 if (ST.hasVOP3PInsts()) { 391 MinNumMaxNum.customFor(FPTypesPK16) 392 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 393 .clampMaxNumElements(0, S16, 2) 394 .clampScalar(0, S16, S64) 395 .scalarize(0); 396 } else if (ST.has16BitInsts()) { 397 MinNumMaxNum.customFor(FPTypes16) 398 .clampScalar(0, S16, S64) 399 .scalarize(0); 400 } else { 401 MinNumMaxNum.customFor(FPTypesBase) 402 .clampScalar(0, S32, S64) 403 .scalarize(0); 404 } 405 406 if (ST.hasVOP3PInsts()) 407 FPOpActions.clampMaxNumElements(0, S16, 2); 408 409 FPOpActions 410 .scalarize(0) 411 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 412 413 TrigActions 414 .scalarize(0) 415 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 416 417 FDIVActions 418 .scalarize(0) 419 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 420 421 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 422 .legalFor(FPTypesPK16) 423 .clampMaxNumElements(0, S16, 2) 424 .scalarize(0) 425 .clampScalar(0, S16, S64); 426 427 if (ST.has16BitInsts()) { 428 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 429 .legalFor({S32, S64, S16}) 430 .scalarize(0) 431 .clampScalar(0, S16, S64); 432 } else { 433 getActionDefinitionsBuilder(G_FSQRT) 434 .legalFor({S32, S64}) 435 .scalarize(0) 436 .clampScalar(0, S32, S64); 437 438 if (ST.hasFractBug()) { 439 getActionDefinitionsBuilder(G_FFLOOR) 440 .customFor({S64}) 441 .legalFor({S32, S64}) 442 .scalarize(0) 443 .clampScalar(0, S32, S64); 444 } else { 445 getActionDefinitionsBuilder(G_FFLOOR) 446 .legalFor({S32, S64}) 447 .scalarize(0) 448 .clampScalar(0, S32, S64); 449 } 450 } 451 452 getActionDefinitionsBuilder(G_FPTRUNC) 453 .legalFor({{S32, S64}, {S16, S32}}) 454 .scalarize(0) 455 .lower(); 456 457 getActionDefinitionsBuilder(G_FPEXT) 458 .legalFor({{S64, S32}, {S32, S16}}) 459 .lowerFor({{S64, S16}}) // FIXME: Implement 460 .scalarize(0); 461 462 getActionDefinitionsBuilder(G_FSUB) 463 // Use actual fsub instruction 464 .legalFor({S32}) 465 // Must use fadd + fneg 466 .lowerFor({S64, S16, V2S16}) 467 .scalarize(0) 468 .clampScalar(0, S32, S64); 469 470 // Whether this is legal depends on the floating point mode for the function. 471 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 472 if (ST.hasMadF16()) 473 FMad.customFor({S32, S16}); 474 else 475 FMad.customFor({S32}); 476 FMad.scalarize(0) 477 .lower(); 478 479 getActionDefinitionsBuilder(G_TRUNC) 480 .alwaysLegal(); 481 482 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 483 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 484 {S32, S1}, {S64, S1}, {S16, S1}}) 485 .scalarize(0) 486 .clampScalar(0, S32, S64) 487 .widenScalarToNextPow2(1, 32); 488 489 // TODO: Split s1->s64 during regbankselect for VALU. 490 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 491 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 492 .lowerFor({{S32, S64}}) 493 .lowerIf(typeIs(1, S1)) 494 .customFor({{S64, S64}}); 495 if (ST.has16BitInsts()) 496 IToFP.legalFor({{S16, S16}}); 497 IToFP.clampScalar(1, S32, S64) 498 .scalarize(0) 499 .widenScalarToNextPow2(1); 500 501 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 502 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 503 .customFor({{S64, S64}}); 504 if (ST.has16BitInsts()) 505 FPToI.legalFor({{S16, S16}}); 506 else 507 FPToI.minScalar(1, S32); 508 509 FPToI.minScalar(0, S32) 510 .scalarize(0) 511 .lower(); 512 513 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 514 .scalarize(0) 515 .lower(); 516 517 if (ST.has16BitInsts()) { 518 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 519 .legalFor({S16, S32, S64}) 520 .clampScalar(0, S16, S64) 521 .scalarize(0); 522 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 523 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 524 .legalFor({S32, S64}) 525 .clampScalar(0, S32, S64) 526 .scalarize(0); 527 } else { 528 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 529 .legalFor({S32}) 530 .customFor({S64}) 531 .clampScalar(0, S32, S64) 532 .scalarize(0); 533 } 534 535 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 536 .scalarize(0) 537 .alwaysLegal(); 538 539 auto &CmpBuilder = 540 getActionDefinitionsBuilder(G_ICMP) 541 // The compare output type differs based on the register bank of the output, 542 // so make both s1 and s32 legal. 543 // 544 // Scalar compares producing output in scc will be promoted to s32, as that 545 // is the allocatable register type that will be needed for the copy from 546 // scc. This will be promoted during RegBankSelect, and we assume something 547 // before that won't try to use s32 result types. 548 // 549 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 550 // bank. 551 .legalForCartesianProduct( 552 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 553 .legalForCartesianProduct( 554 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 555 if (ST.has16BitInsts()) { 556 CmpBuilder.legalFor({{S1, S16}}); 557 } 558 559 CmpBuilder 560 .widenScalarToNextPow2(1) 561 .clampScalar(1, S32, S64) 562 .scalarize(0) 563 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 564 565 getActionDefinitionsBuilder(G_FCMP) 566 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 567 .widenScalarToNextPow2(1) 568 .clampScalar(1, S32, S64) 569 .scalarize(0); 570 571 // FIXME: fpow has a selection pattern that should move to custom lowering. 572 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 573 if (ST.has16BitInsts()) 574 Exp2Ops.legalFor({S32, S16}); 575 else 576 Exp2Ops.legalFor({S32}); 577 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 578 Exp2Ops.scalarize(0); 579 580 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 581 if (ST.has16BitInsts()) 582 ExpOps.customFor({{S32}, {S16}}); 583 else 584 ExpOps.customFor({S32}); 585 ExpOps.clampScalar(0, MinScalarFPTy, S32) 586 .scalarize(0); 587 588 // The 64-bit versions produce 32-bit results, but only on the SALU. 589 getActionDefinitionsBuilder(G_CTPOP) 590 .legalFor({{S32, S32}, {S32, S64}}) 591 .clampScalar(0, S32, S32) 592 .clampScalar(1, S32, S64) 593 .scalarize(0) 594 .widenScalarToNextPow2(0, 32) 595 .widenScalarToNextPow2(1, 32); 596 597 // The hardware instructions return a different result on 0 than the generic 598 // instructions expect. The hardware produces -1, but these produce the 599 // bitwidth. 600 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 601 .scalarize(0) 602 .clampScalar(0, S32, S32) 603 .clampScalar(1, S32, S64) 604 .widenScalarToNextPow2(0, 32) 605 .widenScalarToNextPow2(1, 32) 606 .lower(); 607 608 // The 64-bit versions produce 32-bit results, but only on the SALU. 609 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 610 .legalFor({{S32, S32}, {S32, S64}}) 611 .clampScalar(0, S32, S32) 612 .clampScalar(1, S32, S64) 613 .scalarize(0) 614 .widenScalarToNextPow2(0, 32) 615 .widenScalarToNextPow2(1, 32); 616 617 getActionDefinitionsBuilder(G_BITREVERSE) 618 .legalFor({S32}) 619 .clampScalar(0, S32, S32) 620 .scalarize(0); 621 622 if (ST.has16BitInsts()) { 623 getActionDefinitionsBuilder(G_BSWAP) 624 .legalFor({S16, S32, V2S16}) 625 .clampMaxNumElements(0, S16, 2) 626 // FIXME: Fixing non-power-of-2 before clamp is workaround for 627 // narrowScalar limitation. 628 .widenScalarToNextPow2(0) 629 .clampScalar(0, S16, S32) 630 .scalarize(0); 631 632 if (ST.hasVOP3PInsts()) { 633 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 634 .legalFor({S32, S16, V2S16}) 635 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 636 .clampMaxNumElements(0, S16, 2) 637 .clampScalar(0, S16, S32) 638 .widenScalarToNextPow2(0) 639 .scalarize(0); 640 } else { 641 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 642 .legalFor({S32, S16}) 643 .widenScalarToNextPow2(0) 644 .clampScalar(0, S16, S32) 645 .scalarize(0); 646 } 647 } else { 648 // TODO: Should have same legality without v_perm_b32 649 getActionDefinitionsBuilder(G_BSWAP) 650 .legalFor({S32}) 651 .lowerIf(narrowerThan(0, 32)) 652 // FIXME: Fixing non-power-of-2 before clamp is workaround for 653 // narrowScalar limitation. 654 .widenScalarToNextPow2(0) 655 .maxScalar(0, S32) 656 .scalarize(0) 657 .lower(); 658 659 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 660 .legalFor({S32}) 661 .clampScalar(0, S32, S32) 662 .widenScalarToNextPow2(0) 663 .scalarize(0); 664 } 665 666 getActionDefinitionsBuilder(G_INTTOPTR) 667 // List the common cases 668 .legalForCartesianProduct(AddrSpaces64, {S64}) 669 .legalForCartesianProduct(AddrSpaces32, {S32}) 670 .scalarize(0) 671 // Accept any address space as long as the size matches 672 .legalIf(sameSize(0, 1)) 673 .widenScalarIf(smallerThan(1, 0), 674 [](const LegalityQuery &Query) { 675 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 676 }) 677 .narrowScalarIf(greaterThan(1, 0), 678 [](const LegalityQuery &Query) { 679 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 680 }); 681 682 getActionDefinitionsBuilder(G_PTRTOINT) 683 // List the common cases 684 .legalForCartesianProduct(AddrSpaces64, {S64}) 685 .legalForCartesianProduct(AddrSpaces32, {S32}) 686 .scalarize(0) 687 // Accept any address space as long as the size matches 688 .legalIf(sameSize(0, 1)) 689 .widenScalarIf(smallerThan(0, 1), 690 [](const LegalityQuery &Query) { 691 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 692 }) 693 .narrowScalarIf( 694 greaterThan(0, 1), 695 [](const LegalityQuery &Query) { 696 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 697 }); 698 699 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 700 .scalarize(0) 701 .custom(); 702 703 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 704 // handle some operations by just promoting the register during 705 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 706 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 707 switch (AS) { 708 // FIXME: Private element size. 709 case AMDGPUAS::PRIVATE_ADDRESS: 710 return 32; 711 // FIXME: Check subtarget 712 case AMDGPUAS::LOCAL_ADDRESS: 713 return ST.useDS128() ? 128 : 64; 714 715 // Treat constant and global as identical. SMRD loads are sometimes usable 716 // for global loads (ideally constant address space should be eliminated) 717 // depending on the context. Legality cannot be context dependent, but 718 // RegBankSelect can split the load as necessary depending on the pointer 719 // register bank/uniformity and if the memory is invariant or not written in 720 // a kernel. 721 case AMDGPUAS::CONSTANT_ADDRESS: 722 case AMDGPUAS::GLOBAL_ADDRESS: 723 return IsLoad ? 512 : 128; 724 default: 725 return 128; 726 } 727 }; 728 729 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 730 bool IsLoad) -> bool { 731 const LLT DstTy = Query.Types[0]; 732 733 // Split vector extloads. 734 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 735 unsigned Align = Query.MMODescrs[0].AlignInBits; 736 737 if (MemSize < DstTy.getSizeInBits()) 738 MemSize = std::max(MemSize, Align); 739 740 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 741 return true; 742 743 const LLT PtrTy = Query.Types[1]; 744 unsigned AS = PtrTy.getAddressSpace(); 745 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 746 return true; 747 748 // Catch weird sized loads that don't evenly divide into the access sizes 749 // TODO: May be able to widen depending on alignment etc. 750 unsigned NumRegs = (MemSize + 31) / 32; 751 if (NumRegs == 3) { 752 if (!ST.hasDwordx3LoadStores()) 753 return true; 754 } else { 755 // If the alignment allows, these should have been widened. 756 if (!isPowerOf2_32(NumRegs)) 757 return true; 758 } 759 760 if (Align < MemSize) { 761 const SITargetLowering *TLI = ST.getTargetLowering(); 762 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 763 } 764 765 return false; 766 }; 767 768 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool { 769 unsigned Size = Query.Types[0].getSizeInBits(); 770 if (isPowerOf2_32(Size)) 771 return false; 772 773 if (Size == 96 && ST.hasDwordx3LoadStores()) 774 return false; 775 776 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 777 if (Size >= maxSizeForAddrSpace(AddrSpace, true)) 778 return false; 779 780 unsigned Align = Query.MMODescrs[0].AlignInBits; 781 unsigned RoundedSize = NextPowerOf2(Size); 782 return (Align >= RoundedSize); 783 }; 784 785 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 786 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 787 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 788 789 // TODO: Refine based on subtargets which support unaligned access or 128-bit 790 // LDS 791 // TODO: Unsupported flat for SI. 792 793 for (unsigned Op : {G_LOAD, G_STORE}) { 794 const bool IsStore = Op == G_STORE; 795 796 auto &Actions = getActionDefinitionsBuilder(Op); 797 // Whitelist the common cases. 798 // TODO: Loads to s16 on gfx9 799 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 800 {V2S32, GlobalPtr, 64, GlobalAlign32}, 801 {V4S32, GlobalPtr, 128, GlobalAlign32}, 802 {S128, GlobalPtr, 128, GlobalAlign32}, 803 {S64, GlobalPtr, 64, GlobalAlign32}, 804 {V2S64, GlobalPtr, 128, GlobalAlign32}, 805 {V2S16, GlobalPtr, 32, GlobalAlign32}, 806 {S32, GlobalPtr, 8, GlobalAlign8}, 807 {S32, GlobalPtr, 16, GlobalAlign16}, 808 809 {S32, LocalPtr, 32, 32}, 810 {S64, LocalPtr, 64, 32}, 811 {V2S32, LocalPtr, 64, 32}, 812 {S32, LocalPtr, 8, 8}, 813 {S32, LocalPtr, 16, 16}, 814 {V2S16, LocalPtr, 32, 32}, 815 816 {S32, PrivatePtr, 32, 32}, 817 {S32, PrivatePtr, 8, 8}, 818 {S32, PrivatePtr, 16, 16}, 819 {V2S16, PrivatePtr, 32, 32}, 820 821 {S32, FlatPtr, 32, GlobalAlign32}, 822 {S32, FlatPtr, 16, GlobalAlign16}, 823 {S32, FlatPtr, 8, GlobalAlign8}, 824 {V2S16, FlatPtr, 32, GlobalAlign32}, 825 826 {S32, ConstantPtr, 32, GlobalAlign32}, 827 {V2S32, ConstantPtr, 64, GlobalAlign32}, 828 {V4S32, ConstantPtr, 128, GlobalAlign32}, 829 {S64, ConstantPtr, 64, GlobalAlign32}, 830 {S128, ConstantPtr, 128, GlobalAlign32}, 831 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 832 Actions 833 .customIf(typeIs(1, Constant32Ptr)) 834 // Widen suitably aligned loads by loading extra elements. 835 .moreElementsIf([=](const LegalityQuery &Query) { 836 const LLT Ty = Query.Types[0]; 837 return Op == G_LOAD && Ty.isVector() && 838 shouldWidenLoadResult(Query); 839 }, moreElementsToNextPow2(0)) 840 .widenScalarIf([=](const LegalityQuery &Query) { 841 const LLT Ty = Query.Types[0]; 842 return Op == G_LOAD && !Ty.isVector() && 843 shouldWidenLoadResult(Query); 844 }, widenScalarOrEltToNextPow2(0)) 845 .narrowScalarIf( 846 [=](const LegalityQuery &Query) -> bool { 847 return !Query.Types[0].isVector() && 848 needToSplitMemOp(Query, Op == G_LOAD); 849 }, 850 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 851 const LLT DstTy = Query.Types[0]; 852 const LLT PtrTy = Query.Types[1]; 853 854 const unsigned DstSize = DstTy.getSizeInBits(); 855 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 856 857 // Split extloads. 858 if (DstSize > MemSize) 859 return std::make_pair(0, LLT::scalar(MemSize)); 860 861 if (!isPowerOf2_32(DstSize)) { 862 // We're probably decomposing an odd sized store. Try to split 863 // to the widest type. TODO: Account for alignment. As-is it 864 // should be OK, since the new parts will be further legalized. 865 unsigned FloorSize = PowerOf2Floor(DstSize); 866 return std::make_pair(0, LLT::scalar(FloorSize)); 867 } 868 869 if (DstSize > 32 && (DstSize % 32 != 0)) { 870 // FIXME: Need a way to specify non-extload of larger size if 871 // suitably aligned. 872 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 873 } 874 875 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 876 Op == G_LOAD); 877 if (MemSize > MaxSize) 878 return std::make_pair(0, LLT::scalar(MaxSize)); 879 880 unsigned Align = Query.MMODescrs[0].AlignInBits; 881 return std::make_pair(0, LLT::scalar(Align)); 882 }) 883 .fewerElementsIf( 884 [=](const LegalityQuery &Query) -> bool { 885 return Query.Types[0].isVector() && 886 needToSplitMemOp(Query, Op == G_LOAD); 887 }, 888 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 889 const LLT DstTy = Query.Types[0]; 890 const LLT PtrTy = Query.Types[1]; 891 892 LLT EltTy = DstTy.getElementType(); 893 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 894 Op == G_LOAD); 895 896 // FIXME: Handle widened to power of 2 results better. This ends 897 // up scalarizing. 898 // FIXME: 3 element stores scalarized on SI 899 900 // Split if it's too large for the address space. 901 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 902 unsigned NumElts = DstTy.getNumElements(); 903 unsigned EltSize = EltTy.getSizeInBits(); 904 905 if (MaxSize % EltSize == 0) { 906 return std::make_pair( 907 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 908 } 909 910 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 911 912 // FIXME: Refine when odd breakdowns handled 913 // The scalars will need to be re-legalized. 914 if (NumPieces == 1 || NumPieces >= NumElts || 915 NumElts % NumPieces != 0) 916 return std::make_pair(0, EltTy); 917 918 return std::make_pair(0, 919 LLT::vector(NumElts / NumPieces, EltTy)); 920 } 921 922 // FIXME: We could probably handle weird extending loads better. 923 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 924 if (DstTy.getSizeInBits() > MemSize) 925 return std::make_pair(0, EltTy); 926 927 unsigned EltSize = EltTy.getSizeInBits(); 928 unsigned DstSize = DstTy.getSizeInBits(); 929 if (!isPowerOf2_32(DstSize)) { 930 // We're probably decomposing an odd sized store. Try to split 931 // to the widest type. TODO: Account for alignment. As-is it 932 // should be OK, since the new parts will be further legalized. 933 unsigned FloorSize = PowerOf2Floor(DstSize); 934 return std::make_pair( 935 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 936 } 937 938 // Need to split because of alignment. 939 unsigned Align = Query.MMODescrs[0].AlignInBits; 940 if (EltSize > Align && 941 (EltSize / Align < DstTy.getNumElements())) { 942 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 943 } 944 945 // May need relegalization for the scalars. 946 return std::make_pair(0, EltTy); 947 }) 948 .minScalar(0, S32); 949 950 if (IsStore) 951 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 952 953 // TODO: Need a bitcast lower option? 954 Actions 955 .legalIf([=](const LegalityQuery &Query) { 956 const LLT Ty0 = Query.Types[0]; 957 unsigned Size = Ty0.getSizeInBits(); 958 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 959 unsigned Align = Query.MMODescrs[0].AlignInBits; 960 961 // FIXME: Widening store from alignment not valid. 962 if (MemSize < Size) 963 MemSize = std::max(MemSize, Align); 964 965 // No extending vector loads. 966 if (Size > MemSize && Ty0.isVector()) 967 return false; 968 969 switch (MemSize) { 970 case 8: 971 case 16: 972 return Size == 32; 973 case 32: 974 case 64: 975 case 128: 976 return true; 977 case 96: 978 return ST.hasDwordx3LoadStores(); 979 case 256: 980 case 512: 981 return true; 982 default: 983 return false; 984 } 985 }) 986 .widenScalarToNextPow2(0) 987 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 988 } 989 990 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 991 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 992 {S32, GlobalPtr, 16, 2 * 8}, 993 {S32, LocalPtr, 8, 8}, 994 {S32, LocalPtr, 16, 16}, 995 {S32, PrivatePtr, 8, 8}, 996 {S32, PrivatePtr, 16, 16}, 997 {S32, ConstantPtr, 8, 8}, 998 {S32, ConstantPtr, 16, 2 * 8}}); 999 if (ST.hasFlatAddressSpace()) { 1000 ExtLoads.legalForTypesWithMemDesc( 1001 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1002 } 1003 1004 ExtLoads.clampScalar(0, S32, S32) 1005 .widenScalarToNextPow2(0) 1006 .unsupportedIfMemSizeNotPow2() 1007 .lower(); 1008 1009 auto &Atomics = getActionDefinitionsBuilder( 1010 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1011 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1012 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1013 G_ATOMICRMW_UMIN}) 1014 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1015 {S64, GlobalPtr}, {S64, LocalPtr}}); 1016 if (ST.hasFlatAddressSpace()) { 1017 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1018 } 1019 1020 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1021 .legalFor({{S32, LocalPtr}}); 1022 1023 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1024 // demarshalling 1025 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1026 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1027 {S32, FlatPtr}, {S64, FlatPtr}}) 1028 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1029 {S32, RegionPtr}, {S64, RegionPtr}}); 1030 // TODO: Pointer types, any 32-bit or 64-bit vector 1031 1032 // Condition should be s32 for scalar, s1 for vector. 1033 getActionDefinitionsBuilder(G_SELECT) 1034 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1035 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1036 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1037 .clampScalar(0, S16, S64) 1038 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1039 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1040 .scalarize(1) 1041 .clampMaxNumElements(0, S32, 2) 1042 .clampMaxNumElements(0, LocalPtr, 2) 1043 .clampMaxNumElements(0, PrivatePtr, 2) 1044 .scalarize(0) 1045 .widenScalarToNextPow2(0) 1046 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1047 1048 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1049 // be more flexible with the shift amount type. 1050 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1051 .legalFor({{S32, S32}, {S64, S32}}); 1052 if (ST.has16BitInsts()) { 1053 if (ST.hasVOP3PInsts()) { 1054 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 1055 .clampMaxNumElements(0, S16, 2); 1056 } else 1057 Shifts.legalFor({{S16, S32}, {S16, S16}}); 1058 1059 // TODO: Support 16-bit shift amounts 1060 Shifts.clampScalar(1, S32, S32); 1061 Shifts.clampScalar(0, S16, S64); 1062 Shifts.widenScalarToNextPow2(0, 16); 1063 } else { 1064 // Make sure we legalize the shift amount type first, as the general 1065 // expansion for the shifted type will produce much worse code if it hasn't 1066 // been truncated already. 1067 Shifts.clampScalar(1, S32, S32); 1068 Shifts.clampScalar(0, S32, S64); 1069 Shifts.widenScalarToNextPow2(0, 32); 1070 } 1071 Shifts.scalarize(0); 1072 1073 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1074 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1075 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1076 unsigned IdxTypeIdx = 2; 1077 1078 getActionDefinitionsBuilder(Op) 1079 .customIf([=](const LegalityQuery &Query) { 1080 const LLT EltTy = Query.Types[EltTypeIdx]; 1081 const LLT VecTy = Query.Types[VecTypeIdx]; 1082 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1083 return (EltTy.getSizeInBits() == 16 || 1084 EltTy.getSizeInBits() % 32 == 0) && 1085 VecTy.getSizeInBits() % 32 == 0 && 1086 VecTy.getSizeInBits() <= 1024 && 1087 IdxTy.getSizeInBits() == 32; 1088 }) 1089 .clampScalar(EltTypeIdx, S32, S64) 1090 .clampScalar(VecTypeIdx, S32, S64) 1091 .clampScalar(IdxTypeIdx, S32, S32); 1092 } 1093 1094 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1095 .unsupportedIf([=](const LegalityQuery &Query) { 1096 const LLT &EltTy = Query.Types[1].getElementType(); 1097 return Query.Types[0] != EltTy; 1098 }); 1099 1100 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1101 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1102 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1103 1104 // FIXME: Doesn't handle extract of illegal sizes. 1105 getActionDefinitionsBuilder(Op) 1106 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1107 // FIXME: Multiples of 16 should not be legal. 1108 .legalIf([=](const LegalityQuery &Query) { 1109 const LLT BigTy = Query.Types[BigTyIdx]; 1110 const LLT LitTy = Query.Types[LitTyIdx]; 1111 return (BigTy.getSizeInBits() % 32 == 0) && 1112 (LitTy.getSizeInBits() % 16 == 0); 1113 }) 1114 .widenScalarIf( 1115 [=](const LegalityQuery &Query) { 1116 const LLT BigTy = Query.Types[BigTyIdx]; 1117 return (BigTy.getScalarSizeInBits() < 16); 1118 }, 1119 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1120 .widenScalarIf( 1121 [=](const LegalityQuery &Query) { 1122 const LLT LitTy = Query.Types[LitTyIdx]; 1123 return (LitTy.getScalarSizeInBits() < 16); 1124 }, 1125 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1126 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1127 .widenScalarToNextPow2(BigTyIdx, 32); 1128 1129 } 1130 1131 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1132 .legalForCartesianProduct(AllS32Vectors, {S32}) 1133 .legalForCartesianProduct(AllS64Vectors, {S64}) 1134 .clampNumElements(0, V16S32, V32S32) 1135 .clampNumElements(0, V2S64, V16S64) 1136 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1137 1138 if (ST.hasScalarPackInsts()) { 1139 BuildVector 1140 // FIXME: Should probably widen s1 vectors straight to s32 1141 .minScalarOrElt(0, S16) 1142 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1143 .minScalar(1, S32); 1144 1145 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1146 .legalFor({V2S16, S32}) 1147 .lower(); 1148 BuildVector.minScalarOrElt(0, S32); 1149 } else { 1150 BuildVector.customFor({V2S16, S16}); 1151 BuildVector.minScalarOrElt(0, S32); 1152 1153 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1154 .customFor({V2S16, S32}) 1155 .lower(); 1156 } 1157 1158 BuildVector.legalIf(isRegisterType(0)); 1159 1160 // FIXME: Clamp maximum size 1161 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1162 .legalIf(isRegisterType(0)); 1163 1164 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1165 // pre-legalize. 1166 if (ST.hasVOP3PInsts()) { 1167 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1168 .customFor({V2S16, V2S16}) 1169 .lower(); 1170 } else 1171 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1172 1173 // Merge/Unmerge 1174 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1175 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1176 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1177 1178 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1179 const LLT &Ty = Query.Types[TypeIdx]; 1180 if (Ty.isVector()) { 1181 const LLT &EltTy = Ty.getElementType(); 1182 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) 1183 return true; 1184 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1185 return true; 1186 } 1187 return false; 1188 }; 1189 1190 auto &Builder = getActionDefinitionsBuilder(Op) 1191 // Try to widen to s16 first for small types. 1192 // TODO: Only do this on targets with legal s16 shifts 1193 .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1194 1195 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1196 .lowerFor({{S16, V2S16}}) 1197 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1198 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1199 elementTypeIs(1, S16)), 1200 changeTo(1, V2S16)) 1201 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1202 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1203 // valid. 1204 .clampScalar(LitTyIdx, S32, S256) 1205 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1206 // Break up vectors with weird elements into scalars 1207 .fewerElementsIf( 1208 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, 1209 scalarize(0)) 1210 .fewerElementsIf( 1211 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, 1212 scalarize(1)) 1213 .clampScalar(BigTyIdx, S32, S1024); 1214 1215 if (Op == G_MERGE_VALUES) { 1216 Builder.widenScalarIf( 1217 // TODO: Use 16-bit shifts if legal for 8-bit values? 1218 [=](const LegalityQuery &Query) { 1219 const LLT Ty = Query.Types[LitTyIdx]; 1220 return Ty.getSizeInBits() < 32; 1221 }, 1222 changeTo(LitTyIdx, S32)); 1223 } 1224 1225 Builder.widenScalarIf( 1226 [=](const LegalityQuery &Query) { 1227 const LLT Ty = Query.Types[BigTyIdx]; 1228 return !isPowerOf2_32(Ty.getSizeInBits()) && 1229 Ty.getSizeInBits() % 16 != 0; 1230 }, 1231 [=](const LegalityQuery &Query) { 1232 // Pick the next power of 2, or a multiple of 64 over 128. 1233 // Whichever is smaller. 1234 const LLT &Ty = Query.Types[BigTyIdx]; 1235 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1236 if (NewSizeInBits >= 256) { 1237 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1238 if (RoundedTo < NewSizeInBits) 1239 NewSizeInBits = RoundedTo; 1240 } 1241 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1242 }) 1243 .legalIf([=](const LegalityQuery &Query) { 1244 const LLT &BigTy = Query.Types[BigTyIdx]; 1245 const LLT &LitTy = Query.Types[LitTyIdx]; 1246 1247 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1248 return false; 1249 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1250 return false; 1251 1252 return BigTy.getSizeInBits() % 16 == 0 && 1253 LitTy.getSizeInBits() % 16 == 0 && 1254 BigTy.getSizeInBits() <= 1024; 1255 }) 1256 // Any vectors left are the wrong size. Scalarize them. 1257 .scalarize(0) 1258 .scalarize(1); 1259 } 1260 1261 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1262 // RegBankSelect. 1263 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1264 .legalFor({{S32}, {S64}}); 1265 1266 if (ST.hasVOP3PInsts()) { 1267 SextInReg.lowerFor({{V2S16}}) 1268 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1269 // get more vector shift opportunities, since we'll get those when 1270 // expanded. 1271 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1272 } else if (ST.has16BitInsts()) { 1273 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1274 } else { 1275 // Prefer to promote to s32 before lowering if we don't have 16-bit 1276 // shifts. This avoid a lot of intermediate truncate and extend operations. 1277 SextInReg.lowerFor({{S32}, {S64}}); 1278 } 1279 1280 SextInReg 1281 .scalarize(0) 1282 .clampScalar(0, S32, S64) 1283 .lower(); 1284 1285 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1286 .legalFor({S64}); 1287 1288 getActionDefinitionsBuilder({ 1289 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1290 G_FCOPYSIGN, 1291 1292 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1293 G_READ_REGISTER, 1294 G_WRITE_REGISTER, 1295 1296 G_SADDO, G_SSUBO, 1297 1298 // TODO: Implement 1299 G_FMINIMUM, G_FMAXIMUM 1300 }).lower(); 1301 1302 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1303 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1304 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1305 .unsupported(); 1306 1307 computeTables(); 1308 verify(*ST.getInstrInfo()); 1309 } 1310 1311 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1312 MachineRegisterInfo &MRI, 1313 MachineIRBuilder &B, 1314 GISelChangeObserver &Observer) const { 1315 switch (MI.getOpcode()) { 1316 case TargetOpcode::G_ADDRSPACE_CAST: 1317 return legalizeAddrSpaceCast(MI, MRI, B); 1318 case TargetOpcode::G_FRINT: 1319 return legalizeFrint(MI, MRI, B); 1320 case TargetOpcode::G_FCEIL: 1321 return legalizeFceil(MI, MRI, B); 1322 case TargetOpcode::G_INTRINSIC_TRUNC: 1323 return legalizeIntrinsicTrunc(MI, MRI, B); 1324 case TargetOpcode::G_SITOFP: 1325 return legalizeITOFP(MI, MRI, B, true); 1326 case TargetOpcode::G_UITOFP: 1327 return legalizeITOFP(MI, MRI, B, false); 1328 case TargetOpcode::G_FPTOSI: 1329 return legalizeFPTOI(MI, MRI, B, true); 1330 case TargetOpcode::G_FPTOUI: 1331 return legalizeFPTOI(MI, MRI, B, false); 1332 case TargetOpcode::G_FMINNUM: 1333 case TargetOpcode::G_FMAXNUM: 1334 case TargetOpcode::G_FMINNUM_IEEE: 1335 case TargetOpcode::G_FMAXNUM_IEEE: 1336 return legalizeMinNumMaxNum(MI, MRI, B); 1337 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1338 return legalizeExtractVectorElt(MI, MRI, B); 1339 case TargetOpcode::G_INSERT_VECTOR_ELT: 1340 return legalizeInsertVectorElt(MI, MRI, B); 1341 case TargetOpcode::G_SHUFFLE_VECTOR: 1342 return legalizeShuffleVector(MI, MRI, B); 1343 case TargetOpcode::G_FSIN: 1344 case TargetOpcode::G_FCOS: 1345 return legalizeSinCos(MI, MRI, B); 1346 case TargetOpcode::G_GLOBAL_VALUE: 1347 return legalizeGlobalValue(MI, MRI, B); 1348 case TargetOpcode::G_LOAD: 1349 return legalizeLoad(MI, MRI, B, Observer); 1350 case TargetOpcode::G_FMAD: 1351 return legalizeFMad(MI, MRI, B); 1352 case TargetOpcode::G_FDIV: 1353 return legalizeFDIV(MI, MRI, B); 1354 case TargetOpcode::G_UDIV: 1355 case TargetOpcode::G_UREM: 1356 return legalizeUDIV_UREM(MI, MRI, B); 1357 case TargetOpcode::G_SDIV: 1358 case TargetOpcode::G_SREM: 1359 return legalizeSDIV_SREM(MI, MRI, B); 1360 case TargetOpcode::G_ATOMIC_CMPXCHG: 1361 return legalizeAtomicCmpXChg(MI, MRI, B); 1362 case TargetOpcode::G_FLOG: 1363 return legalizeFlog(MI, B, 1.0f / numbers::log2ef); 1364 case TargetOpcode::G_FLOG10: 1365 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1366 case TargetOpcode::G_FEXP: 1367 return legalizeFExp(MI, B); 1368 case TargetOpcode::G_FPOW: 1369 return legalizeFPow(MI, B); 1370 case TargetOpcode::G_FFLOOR: 1371 return legalizeFFloor(MI, MRI, B); 1372 case TargetOpcode::G_BUILD_VECTOR: 1373 return legalizeBuildVector(MI, MRI, B); 1374 default: 1375 return false; 1376 } 1377 1378 llvm_unreachable("expected switch to return"); 1379 } 1380 1381 Register AMDGPULegalizerInfo::getSegmentAperture( 1382 unsigned AS, 1383 MachineRegisterInfo &MRI, 1384 MachineIRBuilder &B) const { 1385 MachineFunction &MF = B.getMF(); 1386 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1387 const LLT S32 = LLT::scalar(32); 1388 1389 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1390 1391 if (ST.hasApertureRegs()) { 1392 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1393 // getreg. 1394 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1395 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1396 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1397 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1398 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1399 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1400 unsigned Encoding = 1401 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1402 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1403 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1404 1405 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1406 1407 B.buildInstr(AMDGPU::S_GETREG_B32) 1408 .addDef(GetReg) 1409 .addImm(Encoding); 1410 MRI.setType(GetReg, S32); 1411 1412 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1413 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1414 } 1415 1416 Register QueuePtr = MRI.createGenericVirtualRegister( 1417 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1418 1419 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1420 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1421 return Register(); 1422 1423 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1424 // private_segment_aperture_base_hi. 1425 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1426 1427 // TODO: can we be smarter about machine pointer info? 1428 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1429 MachineMemOperand *MMO = MF.getMachineMemOperand( 1430 PtrInfo, 1431 MachineMemOperand::MOLoad | 1432 MachineMemOperand::MODereferenceable | 1433 MachineMemOperand::MOInvariant, 1434 4, 1435 MinAlign(64, StructOffset)); 1436 1437 Register LoadAddr; 1438 1439 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1440 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1441 } 1442 1443 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1444 MachineInstr &MI, MachineRegisterInfo &MRI, 1445 MachineIRBuilder &B) const { 1446 MachineFunction &MF = B.getMF(); 1447 1448 B.setInstr(MI); 1449 1450 const LLT S32 = LLT::scalar(32); 1451 Register Dst = MI.getOperand(0).getReg(); 1452 Register Src = MI.getOperand(1).getReg(); 1453 1454 LLT DstTy = MRI.getType(Dst); 1455 LLT SrcTy = MRI.getType(Src); 1456 unsigned DestAS = DstTy.getAddressSpace(); 1457 unsigned SrcAS = SrcTy.getAddressSpace(); 1458 1459 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1460 // vector element. 1461 assert(!DstTy.isVector()); 1462 1463 const AMDGPUTargetMachine &TM 1464 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1465 1466 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1467 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1468 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1469 return true; 1470 } 1471 1472 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1473 // Truncate. 1474 B.buildExtract(Dst, Src, 0); 1475 MI.eraseFromParent(); 1476 return true; 1477 } 1478 1479 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1480 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1481 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1482 1483 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1484 // another. Merge operands are required to be the same type, but creating an 1485 // extra ptrtoint would be kind of pointless. 1486 auto HighAddr = B.buildConstant( 1487 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1488 B.buildMerge(Dst, {Src, HighAddr}); 1489 MI.eraseFromParent(); 1490 return true; 1491 } 1492 1493 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1494 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1495 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1496 unsigned NullVal = TM.getNullPointerValue(DestAS); 1497 1498 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1499 auto FlatNull = B.buildConstant(SrcTy, 0); 1500 1501 // Extract low 32-bits of the pointer. 1502 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1503 1504 auto CmpRes = 1505 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1506 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1507 1508 MI.eraseFromParent(); 1509 return true; 1510 } 1511 1512 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1513 return false; 1514 1515 if (!ST.hasFlatAddressSpace()) 1516 return false; 1517 1518 auto SegmentNull = 1519 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1520 auto FlatNull = 1521 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1522 1523 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1524 if (!ApertureReg.isValid()) 1525 return false; 1526 1527 auto CmpRes = 1528 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1529 1530 // Coerce the type of the low half of the result so we can use merge_values. 1531 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1532 1533 // TODO: Should we allow mismatched types but matching sizes in merges to 1534 // avoid the ptrtoint? 1535 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1536 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1537 1538 MI.eraseFromParent(); 1539 return true; 1540 } 1541 1542 bool AMDGPULegalizerInfo::legalizeFrint( 1543 MachineInstr &MI, MachineRegisterInfo &MRI, 1544 MachineIRBuilder &B) const { 1545 B.setInstr(MI); 1546 1547 Register Src = MI.getOperand(1).getReg(); 1548 LLT Ty = MRI.getType(Src); 1549 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1550 1551 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1552 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1553 1554 auto C1 = B.buildFConstant(Ty, C1Val); 1555 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1556 1557 // TODO: Should this propagate fast-math-flags? 1558 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1559 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1560 1561 auto C2 = B.buildFConstant(Ty, C2Val); 1562 auto Fabs = B.buildFAbs(Ty, Src); 1563 1564 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1565 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1566 return true; 1567 } 1568 1569 bool AMDGPULegalizerInfo::legalizeFceil( 1570 MachineInstr &MI, MachineRegisterInfo &MRI, 1571 MachineIRBuilder &B) const { 1572 B.setInstr(MI); 1573 1574 const LLT S1 = LLT::scalar(1); 1575 const LLT S64 = LLT::scalar(64); 1576 1577 Register Src = MI.getOperand(1).getReg(); 1578 assert(MRI.getType(Src) == S64); 1579 1580 // result = trunc(src) 1581 // if (src > 0.0 && src != result) 1582 // result += 1.0 1583 1584 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1585 1586 const auto Zero = B.buildFConstant(S64, 0.0); 1587 const auto One = B.buildFConstant(S64, 1.0); 1588 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1589 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1590 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1591 auto Add = B.buildSelect(S64, And, One, Zero); 1592 1593 // TODO: Should this propagate fast-math-flags? 1594 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1595 return true; 1596 } 1597 1598 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1599 MachineIRBuilder &B) { 1600 const unsigned FractBits = 52; 1601 const unsigned ExpBits = 11; 1602 LLT S32 = LLT::scalar(32); 1603 1604 auto Const0 = B.buildConstant(S32, FractBits - 32); 1605 auto Const1 = B.buildConstant(S32, ExpBits); 1606 1607 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1608 .addUse(Const0.getReg(0)) 1609 .addUse(Const1.getReg(0)); 1610 1611 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1612 } 1613 1614 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1615 MachineInstr &MI, MachineRegisterInfo &MRI, 1616 MachineIRBuilder &B) const { 1617 B.setInstr(MI); 1618 1619 const LLT S1 = LLT::scalar(1); 1620 const LLT S32 = LLT::scalar(32); 1621 const LLT S64 = LLT::scalar(64); 1622 1623 Register Src = MI.getOperand(1).getReg(); 1624 assert(MRI.getType(Src) == S64); 1625 1626 // TODO: Should this use extract since the low half is unused? 1627 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1628 Register Hi = Unmerge.getReg(1); 1629 1630 // Extract the upper half, since this is where we will find the sign and 1631 // exponent. 1632 auto Exp = extractF64Exponent(Hi, B); 1633 1634 const unsigned FractBits = 52; 1635 1636 // Extract the sign bit. 1637 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1638 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1639 1640 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1641 1642 const auto Zero32 = B.buildConstant(S32, 0); 1643 1644 // Extend back to 64-bits. 1645 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1646 1647 auto Shr = B.buildAShr(S64, FractMask, Exp); 1648 auto Not = B.buildNot(S64, Shr); 1649 auto Tmp0 = B.buildAnd(S64, Src, Not); 1650 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1651 1652 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1653 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1654 1655 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1656 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1657 return true; 1658 } 1659 1660 bool AMDGPULegalizerInfo::legalizeITOFP( 1661 MachineInstr &MI, MachineRegisterInfo &MRI, 1662 MachineIRBuilder &B, bool Signed) const { 1663 B.setInstr(MI); 1664 1665 Register Dst = MI.getOperand(0).getReg(); 1666 Register Src = MI.getOperand(1).getReg(); 1667 1668 const LLT S64 = LLT::scalar(64); 1669 const LLT S32 = LLT::scalar(32); 1670 1671 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1672 1673 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1674 1675 auto CvtHi = Signed ? 1676 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1677 B.buildUITOFP(S64, Unmerge.getReg(1)); 1678 1679 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1680 1681 auto ThirtyTwo = B.buildConstant(S32, 32); 1682 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1683 .addUse(CvtHi.getReg(0)) 1684 .addUse(ThirtyTwo.getReg(0)); 1685 1686 // TODO: Should this propagate fast-math-flags? 1687 B.buildFAdd(Dst, LdExp, CvtLo); 1688 MI.eraseFromParent(); 1689 return true; 1690 } 1691 1692 // TODO: Copied from DAG implementation. Verify logic and document how this 1693 // actually works. 1694 bool AMDGPULegalizerInfo::legalizeFPTOI( 1695 MachineInstr &MI, MachineRegisterInfo &MRI, 1696 MachineIRBuilder &B, bool Signed) const { 1697 B.setInstr(MI); 1698 1699 Register Dst = MI.getOperand(0).getReg(); 1700 Register Src = MI.getOperand(1).getReg(); 1701 1702 const LLT S64 = LLT::scalar(64); 1703 const LLT S32 = LLT::scalar(32); 1704 1705 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1706 1707 unsigned Flags = MI.getFlags(); 1708 1709 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1710 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1711 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1712 1713 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1714 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1715 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1716 1717 auto Hi = Signed ? 1718 B.buildFPTOSI(S32, FloorMul) : 1719 B.buildFPTOUI(S32, FloorMul); 1720 auto Lo = B.buildFPTOUI(S32, Fma); 1721 1722 B.buildMerge(Dst, { Lo, Hi }); 1723 MI.eraseFromParent(); 1724 1725 return true; 1726 } 1727 1728 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1729 MachineInstr &MI, MachineRegisterInfo &MRI, 1730 MachineIRBuilder &B) const { 1731 MachineFunction &MF = B.getMF(); 1732 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1733 1734 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1735 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1736 1737 // With ieee_mode disabled, the instructions have the correct behavior 1738 // already for G_FMINNUM/G_FMAXNUM 1739 if (!MFI->getMode().IEEE) 1740 return !IsIEEEOp; 1741 1742 if (IsIEEEOp) 1743 return true; 1744 1745 MachineIRBuilder HelperBuilder(MI); 1746 GISelObserverWrapper DummyObserver; 1747 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1748 HelperBuilder.setInstr(MI); 1749 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1750 } 1751 1752 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1753 MachineInstr &MI, MachineRegisterInfo &MRI, 1754 MachineIRBuilder &B) const { 1755 // TODO: Should move some of this into LegalizerHelper. 1756 1757 // TODO: Promote dynamic indexing of s16 to s32 1758 1759 // FIXME: Artifact combiner probably should have replaced the truncated 1760 // constant before this, so we shouldn't need 1761 // getConstantVRegValWithLookThrough. 1762 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1763 MI.getOperand(2).getReg(), MRI); 1764 if (!IdxVal) // Dynamic case will be selected to register indexing. 1765 return true; 1766 1767 Register Dst = MI.getOperand(0).getReg(); 1768 Register Vec = MI.getOperand(1).getReg(); 1769 1770 LLT VecTy = MRI.getType(Vec); 1771 LLT EltTy = VecTy.getElementType(); 1772 assert(EltTy == MRI.getType(Dst)); 1773 1774 B.setInstr(MI); 1775 1776 if (IdxVal->Value < VecTy.getNumElements()) 1777 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1778 else 1779 B.buildUndef(Dst); 1780 1781 MI.eraseFromParent(); 1782 return true; 1783 } 1784 1785 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1786 MachineInstr &MI, MachineRegisterInfo &MRI, 1787 MachineIRBuilder &B) const { 1788 // TODO: Should move some of this into LegalizerHelper. 1789 1790 // TODO: Promote dynamic indexing of s16 to s32 1791 1792 // FIXME: Artifact combiner probably should have replaced the truncated 1793 // constant before this, so we shouldn't need 1794 // getConstantVRegValWithLookThrough. 1795 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1796 MI.getOperand(3).getReg(), MRI); 1797 if (!IdxVal) // Dynamic case will be selected to register indexing. 1798 return true; 1799 1800 Register Dst = MI.getOperand(0).getReg(); 1801 Register Vec = MI.getOperand(1).getReg(); 1802 Register Ins = MI.getOperand(2).getReg(); 1803 1804 LLT VecTy = MRI.getType(Vec); 1805 LLT EltTy = VecTy.getElementType(); 1806 assert(EltTy == MRI.getType(Ins)); 1807 1808 B.setInstr(MI); 1809 1810 if (IdxVal->Value < VecTy.getNumElements()) 1811 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1812 else 1813 B.buildUndef(Dst); 1814 1815 MI.eraseFromParent(); 1816 return true; 1817 } 1818 1819 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) { 1820 assert(Mask.size() == 2); 1821 1822 // If one half is undef, the other is trivially in the same reg. 1823 if (Mask[0] == -1 || Mask[1] == -1) 1824 return true; 1825 return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) || 1826 ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3)); 1827 } 1828 1829 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1830 MachineInstr &MI, MachineRegisterInfo &MRI, 1831 MachineIRBuilder &B) const { 1832 const LLT V2S16 = LLT::vector(2, 16); 1833 1834 Register Dst = MI.getOperand(0).getReg(); 1835 Register Src0 = MI.getOperand(1).getReg(); 1836 LLT DstTy = MRI.getType(Dst); 1837 LLT SrcTy = MRI.getType(Src0); 1838 1839 if (SrcTy == V2S16 && DstTy == V2S16 && 1840 isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1841 return true; 1842 1843 MachineIRBuilder HelperBuilder(MI); 1844 GISelObserverWrapper DummyObserver; 1845 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1846 HelperBuilder.setInstr(MI); 1847 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1848 } 1849 1850 bool AMDGPULegalizerInfo::legalizeSinCos( 1851 MachineInstr &MI, MachineRegisterInfo &MRI, 1852 MachineIRBuilder &B) const { 1853 B.setInstr(MI); 1854 1855 Register DstReg = MI.getOperand(0).getReg(); 1856 Register SrcReg = MI.getOperand(1).getReg(); 1857 LLT Ty = MRI.getType(DstReg); 1858 unsigned Flags = MI.getFlags(); 1859 1860 Register TrigVal; 1861 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1862 if (ST.hasTrigReducedRange()) { 1863 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1864 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1865 .addUse(MulVal.getReg(0)) 1866 .setMIFlags(Flags).getReg(0); 1867 } else 1868 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1869 1870 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1871 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1872 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1873 .addUse(TrigVal) 1874 .setMIFlags(Flags); 1875 MI.eraseFromParent(); 1876 return true; 1877 } 1878 1879 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1880 Register DstReg, LLT PtrTy, 1881 MachineIRBuilder &B, const GlobalValue *GV, 1882 unsigned Offset, unsigned GAFlags) const { 1883 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1884 // to the following code sequence: 1885 // 1886 // For constant address space: 1887 // s_getpc_b64 s[0:1] 1888 // s_add_u32 s0, s0, $symbol 1889 // s_addc_u32 s1, s1, 0 1890 // 1891 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1892 // a fixup or relocation is emitted to replace $symbol with a literal 1893 // constant, which is a pc-relative offset from the encoding of the $symbol 1894 // operand to the global variable. 1895 // 1896 // For global address space: 1897 // s_getpc_b64 s[0:1] 1898 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1899 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1900 // 1901 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1902 // fixups or relocations are emitted to replace $symbol@*@lo and 1903 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1904 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1905 // operand to the global variable. 1906 // 1907 // What we want here is an offset from the value returned by s_getpc 1908 // (which is the address of the s_add_u32 instruction) to the global 1909 // variable, but since the encoding of $symbol starts 4 bytes after the start 1910 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1911 // small. This requires us to add 4 to the global variable offset in order to 1912 // compute the correct address. 1913 1914 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1915 1916 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1917 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1918 1919 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1920 .addDef(PCReg); 1921 1922 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1923 if (GAFlags == SIInstrInfo::MO_NONE) 1924 MIB.addImm(0); 1925 else 1926 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1927 1928 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1929 1930 if (PtrTy.getSizeInBits() == 32) 1931 B.buildExtract(DstReg, PCReg, 0); 1932 return true; 1933 } 1934 1935 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1936 MachineInstr &MI, MachineRegisterInfo &MRI, 1937 MachineIRBuilder &B) const { 1938 Register DstReg = MI.getOperand(0).getReg(); 1939 LLT Ty = MRI.getType(DstReg); 1940 unsigned AS = Ty.getAddressSpace(); 1941 1942 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1943 MachineFunction &MF = B.getMF(); 1944 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1945 B.setInstr(MI); 1946 1947 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1948 if (!MFI->isEntryFunction()) { 1949 const Function &Fn = MF.getFunction(); 1950 DiagnosticInfoUnsupported BadLDSDecl( 1951 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1952 Fn.getContext().diagnose(BadLDSDecl); 1953 } 1954 1955 // TODO: We could emit code to handle the initialization somewhere. 1956 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1957 const SITargetLowering *TLI = ST.getTargetLowering(); 1958 if (!TLI->shouldUseLDSConstAddress(GV)) { 1959 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 1960 return true; // Leave in place; 1961 } 1962 1963 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 1964 MI.eraseFromParent(); 1965 return true; 1966 } 1967 1968 const Function &Fn = MF.getFunction(); 1969 DiagnosticInfoUnsupported BadInit( 1970 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 1971 Fn.getContext().diagnose(BadInit); 1972 return true; 1973 } 1974 1975 const SITargetLowering *TLI = ST.getTargetLowering(); 1976 1977 if (TLI->shouldEmitFixup(GV)) { 1978 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 1979 MI.eraseFromParent(); 1980 return true; 1981 } 1982 1983 if (TLI->shouldEmitPCReloc(GV)) { 1984 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 1985 MI.eraseFromParent(); 1986 return true; 1987 } 1988 1989 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1990 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 1991 1992 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 1993 MachinePointerInfo::getGOT(MF), 1994 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1995 MachineMemOperand::MOInvariant, 1996 8 /*Size*/, 8 /*Align*/); 1997 1998 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 1999 2000 if (Ty.getSizeInBits() == 32) { 2001 // Truncate if this is a 32-bit constant adrdess. 2002 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2003 B.buildExtract(DstReg, Load, 0); 2004 } else 2005 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2006 2007 MI.eraseFromParent(); 2008 return true; 2009 } 2010 2011 bool AMDGPULegalizerInfo::legalizeLoad( 2012 MachineInstr &MI, MachineRegisterInfo &MRI, 2013 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2014 B.setInstr(MI); 2015 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2016 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2017 Observer.changingInstr(MI); 2018 MI.getOperand(1).setReg(Cast.getReg(0)); 2019 Observer.changedInstr(MI); 2020 return true; 2021 } 2022 2023 bool AMDGPULegalizerInfo::legalizeFMad( 2024 MachineInstr &MI, MachineRegisterInfo &MRI, 2025 MachineIRBuilder &B) const { 2026 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2027 assert(Ty.isScalar()); 2028 2029 MachineFunction &MF = B.getMF(); 2030 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2031 2032 // TODO: Always legal with future ftz flag. 2033 // FIXME: Do we need just output? 2034 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2035 return true; 2036 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2037 return true; 2038 2039 MachineIRBuilder HelperBuilder(MI); 2040 GISelObserverWrapper DummyObserver; 2041 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2042 HelperBuilder.setMBB(*MI.getParent()); 2043 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2044 } 2045 2046 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2047 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2048 Register DstReg = MI.getOperand(0).getReg(); 2049 Register PtrReg = MI.getOperand(1).getReg(); 2050 Register CmpVal = MI.getOperand(2).getReg(); 2051 Register NewVal = MI.getOperand(3).getReg(); 2052 2053 assert(SITargetLowering::isFlatGlobalAddrSpace( 2054 MRI.getType(PtrReg).getAddressSpace()) && 2055 "this should not have been custom lowered"); 2056 2057 LLT ValTy = MRI.getType(CmpVal); 2058 LLT VecTy = LLT::vector(2, ValTy); 2059 2060 B.setInstr(MI); 2061 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2062 2063 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2064 .addDef(DstReg) 2065 .addUse(PtrReg) 2066 .addUse(PackedVal) 2067 .setMemRefs(MI.memoperands()); 2068 2069 MI.eraseFromParent(); 2070 return true; 2071 } 2072 2073 bool AMDGPULegalizerInfo::legalizeFlog( 2074 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2075 Register Dst = MI.getOperand(0).getReg(); 2076 Register Src = MI.getOperand(1).getReg(); 2077 LLT Ty = B.getMRI()->getType(Dst); 2078 unsigned Flags = MI.getFlags(); 2079 B.setInstr(MI); 2080 2081 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2082 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2083 2084 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2085 MI.eraseFromParent(); 2086 return true; 2087 } 2088 2089 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2090 MachineIRBuilder &B) const { 2091 Register Dst = MI.getOperand(0).getReg(); 2092 Register Src = MI.getOperand(1).getReg(); 2093 unsigned Flags = MI.getFlags(); 2094 LLT Ty = B.getMRI()->getType(Dst); 2095 B.setInstr(MI); 2096 2097 auto K = B.buildFConstant(Ty, numbers::log2e); 2098 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2099 B.buildFExp2(Dst, Mul, Flags); 2100 MI.eraseFromParent(); 2101 return true; 2102 } 2103 2104 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2105 MachineIRBuilder &B) const { 2106 Register Dst = MI.getOperand(0).getReg(); 2107 Register Src0 = MI.getOperand(1).getReg(); 2108 Register Src1 = MI.getOperand(2).getReg(); 2109 unsigned Flags = MI.getFlags(); 2110 LLT Ty = B.getMRI()->getType(Dst); 2111 B.setInstr(MI); 2112 const LLT S16 = LLT::scalar(16); 2113 const LLT S32 = LLT::scalar(32); 2114 2115 if (Ty == S32) { 2116 auto Log = B.buildFLog2(S32, Src0, Flags); 2117 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2118 .addUse(Log.getReg(0)) 2119 .addUse(Src1) 2120 .setMIFlags(Flags); 2121 B.buildFExp2(Dst, Mul, Flags); 2122 } else if (Ty == S16) { 2123 // There's no f16 fmul_legacy, so we need to convert for it. 2124 auto Log = B.buildFLog2(S16, Src0, Flags); 2125 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2126 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2127 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2128 .addUse(Ext0.getReg(0)) 2129 .addUse(Ext1.getReg(0)) 2130 .setMIFlags(Flags); 2131 2132 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2133 } else 2134 return false; 2135 2136 MI.eraseFromParent(); 2137 return true; 2138 } 2139 2140 // Find a source register, ignoring any possible source modifiers. 2141 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2142 Register ModSrc = OrigSrc; 2143 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2144 ModSrc = SrcFNeg->getOperand(1).getReg(); 2145 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2146 ModSrc = SrcFAbs->getOperand(1).getReg(); 2147 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2148 ModSrc = SrcFAbs->getOperand(1).getReg(); 2149 return ModSrc; 2150 } 2151 2152 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2153 MachineRegisterInfo &MRI, 2154 MachineIRBuilder &B) const { 2155 B.setInstr(MI); 2156 2157 const LLT S1 = LLT::scalar(1); 2158 const LLT S64 = LLT::scalar(64); 2159 Register Dst = MI.getOperand(0).getReg(); 2160 Register OrigSrc = MI.getOperand(1).getReg(); 2161 unsigned Flags = MI.getFlags(); 2162 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2163 "this should not have been custom lowered"); 2164 2165 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2166 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2167 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2168 // V_FRACT bug is: 2169 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2170 // 2171 // Convert floor(x) to (x - fract(x)) 2172 2173 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2174 .addUse(OrigSrc) 2175 .setMIFlags(Flags); 2176 2177 // Give source modifier matching some assistance before obscuring a foldable 2178 // pattern. 2179 2180 // TODO: We can avoid the neg on the fract? The input sign to fract 2181 // shouldn't matter? 2182 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2183 2184 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2185 2186 Register Min = MRI.createGenericVirtualRegister(S64); 2187 2188 // We don't need to concern ourselves with the snan handling difference, so 2189 // use the one which will directly select. 2190 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2191 if (MFI->getMode().IEEE) 2192 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2193 else 2194 B.buildFMinNum(Min, Fract, Const, Flags); 2195 2196 Register CorrectedFract = Min; 2197 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2198 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2199 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2200 } 2201 2202 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2203 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2204 2205 MI.eraseFromParent(); 2206 return true; 2207 } 2208 2209 // Turn an illegal packed v2s16 build vector into bit operations. 2210 // TODO: This should probably be a bitcast action in LegalizerHelper. 2211 bool AMDGPULegalizerInfo::legalizeBuildVector( 2212 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2213 Register Dst = MI.getOperand(0).getReg(); 2214 LLT DstTy = MRI.getType(Dst); 2215 const LLT S32 = LLT::scalar(32); 2216 const LLT V2S16 = LLT::vector(2, 16); 2217 (void)DstTy; 2218 (void)V2S16; 2219 assert(DstTy == V2S16); 2220 2221 Register Src0 = MI.getOperand(1).getReg(); 2222 Register Src1 = MI.getOperand(2).getReg(); 2223 assert(MRI.getType(Src0) == LLT::scalar(16)); 2224 2225 B.setInstr(MI); 2226 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2227 B.buildBitcast(Dst, Merge); 2228 2229 MI.eraseFromParent(); 2230 return true; 2231 } 2232 2233 // Return the use branch instruction, otherwise null if the usage is invalid. 2234 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2235 MachineRegisterInfo &MRI, 2236 MachineInstr *&Br) { 2237 Register CondDef = MI.getOperand(0).getReg(); 2238 if (!MRI.hasOneNonDBGUse(CondDef)) 2239 return nullptr; 2240 2241 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2242 if (UseMI.getParent() != MI.getParent() || 2243 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2244 return nullptr; 2245 2246 // Make sure the cond br is followed by a G_BR 2247 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2248 if (Next != MI.getParent()->end()) { 2249 if (Next->getOpcode() != AMDGPU::G_BR) 2250 return nullptr; 2251 Br = &*Next; 2252 } 2253 2254 return &UseMI; 2255 } 2256 2257 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, 2258 Register Reg, LLT Ty) const { 2259 Register LiveIn = MRI.getLiveInVirtReg(Reg); 2260 if (LiveIn) 2261 return LiveIn; 2262 2263 Register NewReg = MRI.createGenericVirtualRegister(Ty); 2264 MRI.addLiveIn(Reg, NewReg); 2265 return NewReg; 2266 } 2267 2268 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2269 const ArgDescriptor *Arg) const { 2270 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2271 return false; // TODO: Handle these 2272 2273 assert(Arg->getRegister().isPhysical()); 2274 2275 MachineRegisterInfo &MRI = *B.getMRI(); 2276 2277 LLT Ty = MRI.getType(DstReg); 2278 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); 2279 2280 if (Arg->isMasked()) { 2281 // TODO: Should we try to emit this once in the entry block? 2282 const LLT S32 = LLT::scalar(32); 2283 const unsigned Mask = Arg->getMask(); 2284 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2285 2286 Register AndMaskSrc = LiveIn; 2287 2288 if (Shift != 0) { 2289 auto ShiftAmt = B.buildConstant(S32, Shift); 2290 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2291 } 2292 2293 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2294 } else 2295 B.buildCopy(DstReg, LiveIn); 2296 2297 // Insert the argument copy if it doens't already exist. 2298 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 2299 if (!MRI.getVRegDef(LiveIn)) { 2300 // FIXME: Should have scoped insert pt 2301 MachineBasicBlock &OrigInsBB = B.getMBB(); 2302 auto OrigInsPt = B.getInsertPt(); 2303 2304 MachineBasicBlock &EntryMBB = B.getMF().front(); 2305 EntryMBB.addLiveIn(Arg->getRegister()); 2306 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2307 B.buildCopy(LiveIn, Arg->getRegister()); 2308 2309 B.setInsertPt(OrigInsBB, OrigInsPt); 2310 } 2311 2312 return true; 2313 } 2314 2315 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2316 MachineInstr &MI, 2317 MachineRegisterInfo &MRI, 2318 MachineIRBuilder &B, 2319 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2320 B.setInstr(MI); 2321 2322 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2323 2324 const ArgDescriptor *Arg; 2325 const TargetRegisterClass *RC; 2326 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2327 if (!Arg) { 2328 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2329 return false; 2330 } 2331 2332 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 2333 MI.eraseFromParent(); 2334 return true; 2335 } 2336 2337 return false; 2338 } 2339 2340 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2341 MachineRegisterInfo &MRI, 2342 MachineIRBuilder &B) const { 2343 B.setInstr(MI); 2344 Register Dst = MI.getOperand(0).getReg(); 2345 LLT DstTy = MRI.getType(Dst); 2346 LLT S16 = LLT::scalar(16); 2347 LLT S32 = LLT::scalar(32); 2348 LLT S64 = LLT::scalar(64); 2349 2350 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2351 return true; 2352 2353 if (DstTy == S16) 2354 return legalizeFDIV16(MI, MRI, B); 2355 if (DstTy == S32) 2356 return legalizeFDIV32(MI, MRI, B); 2357 if (DstTy == S64) 2358 return legalizeFDIV64(MI, MRI, B); 2359 2360 return false; 2361 } 2362 2363 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2364 const LLT S32 = LLT::scalar(32); 2365 2366 auto Cvt0 = B.buildUITOFP(S32, Src); 2367 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2368 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2369 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2370 return B.buildFPTOUI(S32, Mul).getReg(0); 2371 } 2372 2373 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2374 Register DstReg, 2375 Register Num, 2376 Register Den, 2377 bool IsRem) const { 2378 const LLT S1 = LLT::scalar(1); 2379 const LLT S32 = LLT::scalar(32); 2380 2381 // RCP = URECIP(Den) = 2^32 / Den + e 2382 // e is rounding error. 2383 auto RCP = buildDivRCP(B, Den); 2384 2385 // RCP_LO = mul(RCP, Den) 2386 auto RCP_LO = B.buildMul(S32, RCP, Den); 2387 2388 // RCP_HI = mulhu (RCP, Den) */ 2389 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2390 2391 // NEG_RCP_LO = -RCP_LO 2392 auto Zero = B.buildConstant(S32, 0); 2393 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2394 2395 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2396 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2397 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2398 2399 // Calculate the rounding error from the URECIP instruction 2400 // E = mulhu(ABS_RCP_LO, RCP) 2401 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2402 2403 // RCP_A_E = RCP + E 2404 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2405 2406 // RCP_S_E = RCP - E 2407 auto RCP_S_E = B.buildSub(S32, RCP, E); 2408 2409 // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) 2410 auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E); 2411 2412 // Quotient = mulhu(Tmp0, Num)stmp 2413 auto Quotient = B.buildUMulH(S32, Tmp0, Num); 2414 2415 // Num_S_Remainder = Quotient * Den 2416 auto Num_S_Remainder = B.buildMul(S32, Quotient, Den); 2417 2418 // Remainder = Num - Num_S_Remainder 2419 auto Remainder = B.buildSub(S32, Num, Num_S_Remainder); 2420 2421 // Remainder_GE_Den = Remainder >= Den 2422 auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den); 2423 2424 // Remainder_GE_Zero = Num >= Num_S_Remainder; 2425 auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1, 2426 Num, Num_S_Remainder); 2427 2428 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero 2429 auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero); 2430 2431 // Calculate Division result: 2432 2433 // Quotient_A_One = Quotient + 1 2434 auto One = B.buildConstant(S32, 1); 2435 auto Quotient_A_One = B.buildAdd(S32, Quotient, One); 2436 2437 // Quotient_S_One = Quotient - 1 2438 auto Quotient_S_One = B.buildSub(S32, Quotient, One); 2439 2440 // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient) 2441 auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One); 2442 2443 // Div = (Remainder_GE_Zero ? Div : Quotient_S_One) 2444 if (IsRem) { 2445 Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One); 2446 2447 // Calculate Rem result: 2448 auto Remainder_S_Den = B.buildSub(S32, Remainder, Den); 2449 2450 // Remainder_A_Den = Remainder + Den 2451 auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den); 2452 2453 // Rem = (Tmp1 ? Remainder_S_Den : Remainder) 2454 auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder); 2455 2456 // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den) 2457 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den); 2458 } else { 2459 B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One); 2460 } 2461 } 2462 2463 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2464 MachineRegisterInfo &MRI, 2465 MachineIRBuilder &B) const { 2466 B.setInstr(MI); 2467 const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM; 2468 Register DstReg = MI.getOperand(0).getReg(); 2469 Register Num = MI.getOperand(1).getReg(); 2470 Register Den = MI.getOperand(2).getReg(); 2471 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem); 2472 MI.eraseFromParent(); 2473 return true; 2474 } 2475 2476 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2477 MachineRegisterInfo &MRI, 2478 MachineIRBuilder &B) const { 2479 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2480 return legalizeUDIV_UREM32(MI, MRI, B); 2481 return false; 2482 } 2483 2484 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI, 2485 MachineRegisterInfo &MRI, 2486 MachineIRBuilder &B) const { 2487 B.setInstr(MI); 2488 const LLT S32 = LLT::scalar(32); 2489 2490 const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM; 2491 Register DstReg = MI.getOperand(0).getReg(); 2492 Register LHS = MI.getOperand(1).getReg(); 2493 Register RHS = MI.getOperand(2).getReg(); 2494 2495 auto ThirtyOne = B.buildConstant(S32, 31); 2496 auto LHSign = B.buildAShr(S32, LHS, ThirtyOne); 2497 auto RHSign = B.buildAShr(S32, LHS, ThirtyOne); 2498 2499 LHS = B.buildAdd(S32, LHS, LHSign).getReg(0); 2500 RHS = B.buildAdd(S32, RHS, RHSign).getReg(0); 2501 2502 LHS = B.buildXor(S32, LHS, LHSign).getReg(0); 2503 RHS = B.buildXor(S32, RHS, RHSign).getReg(0); 2504 2505 Register UDivRem = MRI.createGenericVirtualRegister(S32); 2506 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem); 2507 2508 if (IsRem) { 2509 auto RSign = LHSign; // Remainder sign is the same as LHS 2510 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0); 2511 B.buildSub(DstReg, UDivRem, RSign); 2512 } else { 2513 auto DSign = B.buildXor(S32, LHSign, RHSign); 2514 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0); 2515 B.buildSub(DstReg, UDivRem, DSign); 2516 } 2517 2518 MI.eraseFromParent(); 2519 return true; 2520 } 2521 2522 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2523 MachineRegisterInfo &MRI, 2524 MachineIRBuilder &B) const { 2525 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2526 return legalizeSDIV_SREM32(MI, MRI, B); 2527 return false; 2528 } 2529 2530 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2531 MachineRegisterInfo &MRI, 2532 MachineIRBuilder &B) const { 2533 Register Res = MI.getOperand(0).getReg(); 2534 Register LHS = MI.getOperand(1).getReg(); 2535 Register RHS = MI.getOperand(2).getReg(); 2536 2537 uint16_t Flags = MI.getFlags(); 2538 2539 LLT ResTy = MRI.getType(Res); 2540 LLT S32 = LLT::scalar(32); 2541 LLT S64 = LLT::scalar(64); 2542 2543 const MachineFunction &MF = B.getMF(); 2544 bool Unsafe = 2545 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2546 2547 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2548 return false; 2549 2550 if (!Unsafe && ResTy == S32 && 2551 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2552 return false; 2553 2554 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2555 // 1 / x -> RCP(x) 2556 if (CLHS->isExactlyValue(1.0)) { 2557 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2558 .addUse(RHS) 2559 .setMIFlags(Flags); 2560 2561 MI.eraseFromParent(); 2562 return true; 2563 } 2564 2565 // -1 / x -> RCP( FNEG(x) ) 2566 if (CLHS->isExactlyValue(-1.0)) { 2567 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2568 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2569 .addUse(FNeg.getReg(0)) 2570 .setMIFlags(Flags); 2571 2572 MI.eraseFromParent(); 2573 return true; 2574 } 2575 } 2576 2577 // x / y -> x * (1.0 / y) 2578 if (Unsafe) { 2579 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2580 .addUse(RHS) 2581 .setMIFlags(Flags); 2582 B.buildFMul(Res, LHS, RCP, Flags); 2583 2584 MI.eraseFromParent(); 2585 return true; 2586 } 2587 2588 return false; 2589 } 2590 2591 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2592 MachineRegisterInfo &MRI, 2593 MachineIRBuilder &B) const { 2594 B.setInstr(MI); 2595 Register Res = MI.getOperand(0).getReg(); 2596 Register LHS = MI.getOperand(1).getReg(); 2597 Register RHS = MI.getOperand(2).getReg(); 2598 2599 uint16_t Flags = MI.getFlags(); 2600 2601 LLT S16 = LLT::scalar(16); 2602 LLT S32 = LLT::scalar(32); 2603 2604 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2605 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2606 2607 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2608 .addUse(RHSExt.getReg(0)) 2609 .setMIFlags(Flags); 2610 2611 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2612 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2613 2614 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2615 .addUse(RDst.getReg(0)) 2616 .addUse(RHS) 2617 .addUse(LHS) 2618 .setMIFlags(Flags); 2619 2620 MI.eraseFromParent(); 2621 return true; 2622 } 2623 2624 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2625 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2626 static void toggleSPDenormMode(bool Enable, 2627 MachineIRBuilder &B, 2628 const GCNSubtarget &ST, 2629 AMDGPU::SIModeRegisterDefaults Mode) { 2630 // Set SP denorm mode to this value. 2631 unsigned SPDenormMode = 2632 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2633 2634 if (ST.hasDenormModeInst()) { 2635 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2636 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2637 2638 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2639 B.buildInstr(AMDGPU::S_DENORM_MODE) 2640 .addImm(NewDenormModeValue); 2641 2642 } else { 2643 // Select FP32 bit field in mode register. 2644 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2645 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2646 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2647 2648 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2649 .addImm(SPDenormMode) 2650 .addImm(SPDenormModeBitField); 2651 } 2652 } 2653 2654 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2655 MachineRegisterInfo &MRI, 2656 MachineIRBuilder &B) const { 2657 B.setInstr(MI); 2658 Register Res = MI.getOperand(0).getReg(); 2659 Register LHS = MI.getOperand(1).getReg(); 2660 Register RHS = MI.getOperand(2).getReg(); 2661 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2662 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2663 2664 uint16_t Flags = MI.getFlags(); 2665 2666 LLT S32 = LLT::scalar(32); 2667 LLT S1 = LLT::scalar(1); 2668 2669 auto One = B.buildFConstant(S32, 1.0f); 2670 2671 auto DenominatorScaled = 2672 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2673 .addUse(RHS) 2674 .addUse(LHS) 2675 .addImm(1) 2676 .setMIFlags(Flags); 2677 auto NumeratorScaled = 2678 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2679 .addUse(LHS) 2680 .addUse(RHS) 2681 .addImm(0) 2682 .setMIFlags(Flags); 2683 2684 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2685 .addUse(DenominatorScaled.getReg(0)) 2686 .setMIFlags(Flags); 2687 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2688 2689 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2690 // aren't modeled as reading it. 2691 if (!Mode.allFP32Denormals()) 2692 toggleSPDenormMode(true, B, ST, Mode); 2693 2694 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2695 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2696 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2697 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2698 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2699 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2700 2701 if (!Mode.allFP32Denormals()) 2702 toggleSPDenormMode(false, B, ST, Mode); 2703 2704 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2705 .addUse(Fma4.getReg(0)) 2706 .addUse(Fma1.getReg(0)) 2707 .addUse(Fma3.getReg(0)) 2708 .addUse(NumeratorScaled.getReg(1)) 2709 .setMIFlags(Flags); 2710 2711 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2712 .addUse(Fmas.getReg(0)) 2713 .addUse(RHS) 2714 .addUse(LHS) 2715 .setMIFlags(Flags); 2716 2717 MI.eraseFromParent(); 2718 return true; 2719 } 2720 2721 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2722 MachineRegisterInfo &MRI, 2723 MachineIRBuilder &B) const { 2724 B.setInstr(MI); 2725 Register Res = MI.getOperand(0).getReg(); 2726 Register LHS = MI.getOperand(1).getReg(); 2727 Register RHS = MI.getOperand(2).getReg(); 2728 2729 uint16_t Flags = MI.getFlags(); 2730 2731 LLT S64 = LLT::scalar(64); 2732 LLT S1 = LLT::scalar(1); 2733 2734 auto One = B.buildFConstant(S64, 1.0); 2735 2736 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2737 .addUse(LHS) 2738 .addUse(RHS) 2739 .addImm(1) 2740 .setMIFlags(Flags); 2741 2742 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2743 2744 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2745 .addUse(DivScale0.getReg(0)) 2746 .setMIFlags(Flags); 2747 2748 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 2749 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 2750 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 2751 2752 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2753 .addUse(LHS) 2754 .addUse(RHS) 2755 .addImm(0) 2756 .setMIFlags(Flags); 2757 2758 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 2759 auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags); 2760 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 2761 2762 Register Scale; 2763 if (!ST.hasUsableDivScaleConditionOutput()) { 2764 // Workaround a hardware bug on SI where the condition output from div_scale 2765 // is not usable. 2766 2767 LLT S32 = LLT::scalar(32); 2768 2769 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2770 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2771 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2772 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2773 2774 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2775 Scale1Unmerge.getReg(1)); 2776 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2777 Scale0Unmerge.getReg(1)); 2778 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2779 } else { 2780 Scale = DivScale1.getReg(1); 2781 } 2782 2783 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2784 .addUse(Fma4.getReg(0)) 2785 .addUse(Fma3.getReg(0)) 2786 .addUse(Mul.getReg(0)) 2787 .addUse(Scale) 2788 .setMIFlags(Flags); 2789 2790 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2791 .addUse(Fmas.getReg(0)) 2792 .addUse(RHS) 2793 .addUse(LHS) 2794 .setMIFlags(Flags); 2795 2796 MI.eraseFromParent(); 2797 return true; 2798 } 2799 2800 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2801 MachineRegisterInfo &MRI, 2802 MachineIRBuilder &B) const { 2803 B.setInstr(MI); 2804 Register Res = MI.getOperand(0).getReg(); 2805 Register LHS = MI.getOperand(2).getReg(); 2806 Register RHS = MI.getOperand(3).getReg(); 2807 uint16_t Flags = MI.getFlags(); 2808 2809 LLT S32 = LLT::scalar(32); 2810 LLT S1 = LLT::scalar(1); 2811 2812 auto Abs = B.buildFAbs(S32, RHS, Flags); 2813 const APFloat C0Val(1.0f); 2814 2815 auto C0 = B.buildConstant(S32, 0x6f800000); 2816 auto C1 = B.buildConstant(S32, 0x2f800000); 2817 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2818 2819 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2820 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2821 2822 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2823 2824 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2825 .addUse(Mul0.getReg(0)) 2826 .setMIFlags(Flags); 2827 2828 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2829 2830 B.buildFMul(Res, Sel, Mul1, Flags); 2831 2832 MI.eraseFromParent(); 2833 return true; 2834 } 2835 2836 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2837 MachineRegisterInfo &MRI, 2838 MachineIRBuilder &B) const { 2839 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2840 if (!MFI->isEntryFunction()) { 2841 return legalizePreloadedArgIntrin(MI, MRI, B, 2842 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2843 } 2844 2845 B.setInstr(MI); 2846 2847 uint64_t Offset = 2848 ST.getTargetLowering()->getImplicitParameterOffset( 2849 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2850 Register DstReg = MI.getOperand(0).getReg(); 2851 LLT DstTy = MRI.getType(DstReg); 2852 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2853 2854 const ArgDescriptor *Arg; 2855 const TargetRegisterClass *RC; 2856 std::tie(Arg, RC) 2857 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2858 if (!Arg) 2859 return false; 2860 2861 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2862 if (!loadInputValue(KernargPtrReg, B, Arg)) 2863 return false; 2864 2865 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2866 MI.eraseFromParent(); 2867 return true; 2868 } 2869 2870 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2871 MachineRegisterInfo &MRI, 2872 MachineIRBuilder &B, 2873 unsigned AddrSpace) const { 2874 B.setInstr(MI); 2875 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 2876 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2877 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2878 MI.eraseFromParent(); 2879 return true; 2880 } 2881 2882 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2883 // offset (the offset that is included in bounds checking and swizzling, to be 2884 // split between the instruction's voffset and immoffset fields) and soffset 2885 // (the offset that is excluded from bounds checking and swizzling, to go in 2886 // the instruction's soffset field). This function takes the first kind of 2887 // offset and figures out how to split it between voffset and immoffset. 2888 std::tuple<Register, unsigned, unsigned> 2889 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2890 Register OrigOffset) const { 2891 const unsigned MaxImm = 4095; 2892 Register BaseReg; 2893 unsigned TotalConstOffset; 2894 MachineInstr *OffsetDef; 2895 const LLT S32 = LLT::scalar(32); 2896 2897 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2898 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2899 2900 unsigned ImmOffset = TotalConstOffset; 2901 2902 // If the immediate value is too big for the immoffset field, put the value 2903 // and -4096 into the immoffset field so that the value that is copied/added 2904 // for the voffset field is a multiple of 4096, and it stands more chance 2905 // of being CSEd with the copy/add for another similar load/store. 2906 // However, do not do that rounding down to a multiple of 4096 if that is a 2907 // negative number, as it appears to be illegal to have a negative offset 2908 // in the vgpr, even if adding the immediate offset makes it positive. 2909 unsigned Overflow = ImmOffset & ~MaxImm; 2910 ImmOffset -= Overflow; 2911 if ((int32_t)Overflow < 0) { 2912 Overflow += ImmOffset; 2913 ImmOffset = 0; 2914 } 2915 2916 if (Overflow != 0) { 2917 if (!BaseReg) { 2918 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2919 } else { 2920 auto OverflowVal = B.buildConstant(S32, Overflow); 2921 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2922 } 2923 } 2924 2925 if (!BaseReg) 2926 BaseReg = B.buildConstant(S32, 0).getReg(0); 2927 2928 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2929 } 2930 2931 /// Handle register layout difference for f16 images for some subtargets. 2932 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2933 MachineRegisterInfo &MRI, 2934 Register Reg) const { 2935 if (!ST.hasUnpackedD16VMem()) 2936 return Reg; 2937 2938 const LLT S16 = LLT::scalar(16); 2939 const LLT S32 = LLT::scalar(32); 2940 LLT StoreVT = MRI.getType(Reg); 2941 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2942 2943 auto Unmerge = B.buildUnmerge(S16, Reg); 2944 2945 SmallVector<Register, 4> WideRegs; 2946 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2947 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2948 2949 int NumElts = StoreVT.getNumElements(); 2950 2951 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2952 } 2953 2954 Register AMDGPULegalizerInfo::fixStoreSourceType( 2955 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2956 MachineRegisterInfo *MRI = B.getMRI(); 2957 LLT Ty = MRI->getType(VData); 2958 2959 const LLT S16 = LLT::scalar(16); 2960 2961 // Fixup illegal register types for i8 stores. 2962 if (Ty == LLT::scalar(8) || Ty == S16) { 2963 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2964 return AnyExt; 2965 } 2966 2967 if (Ty.isVector()) { 2968 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2969 if (IsFormat) 2970 return handleD16VData(B, *MRI, VData); 2971 } 2972 } 2973 2974 return VData; 2975 } 2976 2977 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2978 MachineRegisterInfo &MRI, 2979 MachineIRBuilder &B, 2980 bool IsTyped, 2981 bool IsFormat) const { 2982 B.setInstr(MI); 2983 2984 Register VData = MI.getOperand(1).getReg(); 2985 LLT Ty = MRI.getType(VData); 2986 LLT EltTy = Ty.getScalarType(); 2987 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2988 const LLT S32 = LLT::scalar(32); 2989 2990 VData = fixStoreSourceType(B, VData, IsFormat); 2991 Register RSrc = MI.getOperand(2).getReg(); 2992 2993 MachineMemOperand *MMO = *MI.memoperands_begin(); 2994 const int MemSize = MMO->getSize(); 2995 2996 unsigned ImmOffset; 2997 unsigned TotalOffset; 2998 2999 // The typed intrinsics add an immediate after the registers. 3000 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3001 3002 // The struct intrinsic variants add one additional operand over raw. 3003 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3004 Register VIndex; 3005 int OpOffset = 0; 3006 if (HasVIndex) { 3007 VIndex = MI.getOperand(3).getReg(); 3008 OpOffset = 1; 3009 } 3010 3011 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3012 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3013 3014 unsigned Format = 0; 3015 if (IsTyped) { 3016 Format = MI.getOperand(5 + OpOffset).getImm(); 3017 ++OpOffset; 3018 } 3019 3020 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3021 3022 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3023 if (TotalOffset != 0) 3024 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3025 3026 unsigned Opc; 3027 if (IsTyped) { 3028 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3029 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3030 } else if (IsFormat) { 3031 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3032 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3033 } else { 3034 switch (MemSize) { 3035 case 1: 3036 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3037 break; 3038 case 2: 3039 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3040 break; 3041 default: 3042 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3043 break; 3044 } 3045 } 3046 3047 if (!VIndex) 3048 VIndex = B.buildConstant(S32, 0).getReg(0); 3049 3050 auto MIB = B.buildInstr(Opc) 3051 .addUse(VData) // vdata 3052 .addUse(RSrc) // rsrc 3053 .addUse(VIndex) // vindex 3054 .addUse(VOffset) // voffset 3055 .addUse(SOffset) // soffset 3056 .addImm(ImmOffset); // offset(imm) 3057 3058 if (IsTyped) 3059 MIB.addImm(Format); 3060 3061 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3062 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3063 .addMemOperand(MMO); 3064 3065 MI.eraseFromParent(); 3066 return true; 3067 } 3068 3069 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3070 MachineRegisterInfo &MRI, 3071 MachineIRBuilder &B, 3072 bool IsFormat, 3073 bool IsTyped) const { 3074 B.setInstr(MI); 3075 3076 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3077 MachineMemOperand *MMO = *MI.memoperands_begin(); 3078 const int MemSize = MMO->getSize(); 3079 const LLT S32 = LLT::scalar(32); 3080 3081 Register Dst = MI.getOperand(0).getReg(); 3082 Register RSrc = MI.getOperand(2).getReg(); 3083 3084 // The typed intrinsics add an immediate after the registers. 3085 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3086 3087 // The struct intrinsic variants add one additional operand over raw. 3088 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3089 Register VIndex; 3090 int OpOffset = 0; 3091 if (HasVIndex) { 3092 VIndex = MI.getOperand(3).getReg(); 3093 OpOffset = 1; 3094 } 3095 3096 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3097 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3098 3099 unsigned Format = 0; 3100 if (IsTyped) { 3101 Format = MI.getOperand(5 + OpOffset).getImm(); 3102 ++OpOffset; 3103 } 3104 3105 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3106 unsigned ImmOffset; 3107 unsigned TotalOffset; 3108 3109 LLT Ty = MRI.getType(Dst); 3110 LLT EltTy = Ty.getScalarType(); 3111 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3112 const bool Unpacked = ST.hasUnpackedD16VMem(); 3113 3114 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3115 if (TotalOffset != 0) 3116 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3117 3118 unsigned Opc; 3119 3120 if (IsTyped) { 3121 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3122 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3123 } else if (IsFormat) { 3124 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3125 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3126 } else { 3127 switch (MemSize) { 3128 case 1: 3129 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3130 break; 3131 case 2: 3132 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3133 break; 3134 default: 3135 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3136 break; 3137 } 3138 } 3139 3140 Register LoadDstReg; 3141 3142 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3143 LLT UnpackedTy = Ty.changeElementSize(32); 3144 3145 if (IsExtLoad) 3146 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3147 else if (Unpacked && IsD16 && Ty.isVector()) 3148 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3149 else 3150 LoadDstReg = Dst; 3151 3152 if (!VIndex) 3153 VIndex = B.buildConstant(S32, 0).getReg(0); 3154 3155 auto MIB = B.buildInstr(Opc) 3156 .addDef(LoadDstReg) // vdata 3157 .addUse(RSrc) // rsrc 3158 .addUse(VIndex) // vindex 3159 .addUse(VOffset) // voffset 3160 .addUse(SOffset) // soffset 3161 .addImm(ImmOffset); // offset(imm) 3162 3163 if (IsTyped) 3164 MIB.addImm(Format); 3165 3166 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3167 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3168 .addMemOperand(MMO); 3169 3170 if (LoadDstReg != Dst) { 3171 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3172 3173 // Widen result for extending loads was widened. 3174 if (IsExtLoad) 3175 B.buildTrunc(Dst, LoadDstReg); 3176 else { 3177 // Repack to original 16-bit vector result 3178 // FIXME: G_TRUNC should work, but legalization currently fails 3179 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3180 SmallVector<Register, 4> Repack; 3181 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3182 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3183 B.buildMerge(Dst, Repack); 3184 } 3185 } 3186 3187 MI.eraseFromParent(); 3188 return true; 3189 } 3190 3191 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3192 MachineIRBuilder &B, 3193 bool IsInc) const { 3194 B.setInstr(MI); 3195 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3196 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3197 B.buildInstr(Opc) 3198 .addDef(MI.getOperand(0).getReg()) 3199 .addUse(MI.getOperand(2).getReg()) 3200 .addUse(MI.getOperand(3).getReg()) 3201 .cloneMemRefs(MI); 3202 MI.eraseFromParent(); 3203 return true; 3204 } 3205 3206 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3207 switch (IntrID) { 3208 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3209 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3210 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3211 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3212 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3213 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3214 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3215 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3216 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3217 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3218 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3219 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3220 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3221 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3222 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3223 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3224 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3225 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3226 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3227 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3228 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3229 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3230 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3231 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3232 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3233 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3234 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3235 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3236 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3237 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3238 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3239 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3240 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3241 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3242 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3243 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3244 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3245 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3246 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3247 default: 3248 llvm_unreachable("unhandled atomic opcode"); 3249 } 3250 } 3251 3252 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3253 MachineIRBuilder &B, 3254 Intrinsic::ID IID) const { 3255 B.setInstr(MI); 3256 3257 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3258 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3259 3260 Register Dst = MI.getOperand(0).getReg(); 3261 Register VData = MI.getOperand(2).getReg(); 3262 3263 Register CmpVal; 3264 int OpOffset = 0; 3265 3266 if (IsCmpSwap) { 3267 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3268 ++OpOffset; 3269 } 3270 3271 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3272 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3273 3274 // The struct intrinsic variants add one additional operand over raw. 3275 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3276 Register VIndex; 3277 if (HasVIndex) { 3278 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3279 ++OpOffset; 3280 } 3281 3282 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3283 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3284 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3285 3286 MachineMemOperand *MMO = *MI.memoperands_begin(); 3287 3288 unsigned ImmOffset; 3289 unsigned TotalOffset; 3290 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3291 if (TotalOffset != 0) 3292 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3293 3294 if (!VIndex) 3295 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3296 3297 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3298 .addDef(Dst) 3299 .addUse(VData); // vdata 3300 3301 if (IsCmpSwap) 3302 MIB.addReg(CmpVal); 3303 3304 MIB.addUse(RSrc) // rsrc 3305 .addUse(VIndex) // vindex 3306 .addUse(VOffset) // voffset 3307 .addUse(SOffset) // soffset 3308 .addImm(ImmOffset) // offset(imm) 3309 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3310 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3311 .addMemOperand(MMO); 3312 3313 MI.eraseFromParent(); 3314 return true; 3315 } 3316 3317 // Produce a vector of s16 elements from s32 pieces. 3318 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg, 3319 ArrayRef<Register> UnmergeParts) { 3320 const LLT S16 = LLT::scalar(16); 3321 3322 SmallVector<Register, 4> RemergeParts(UnmergeParts.size()); 3323 for (int I = 0, E = UnmergeParts.size(); I != E; ++I) 3324 RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0); 3325 3326 B.buildBuildVector(DstReg, RemergeParts); 3327 } 3328 3329 /// Convert a set of s32 registers to a result vector with s16 elements. 3330 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg, 3331 ArrayRef<Register> UnmergeParts) { 3332 MachineRegisterInfo &MRI = *B.getMRI(); 3333 const LLT V2S16 = LLT::vector(2, 16); 3334 LLT TargetTy = MRI.getType(DstReg); 3335 int NumElts = UnmergeParts.size(); 3336 3337 if (NumElts == 1) { 3338 assert(TargetTy == V2S16); 3339 B.buildBitcast(DstReg, UnmergeParts[0]); 3340 return; 3341 } 3342 3343 SmallVector<Register, 4> RemergeParts(NumElts); 3344 for (int I = 0; I != NumElts; ++I) 3345 RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0); 3346 3347 if (TargetTy.getSizeInBits() == 32u * NumElts) { 3348 B.buildConcatVectors(DstReg, RemergeParts); 3349 return; 3350 } 3351 3352 const LLT V3S16 = LLT::vector(3, 16); 3353 const LLT V6S16 = LLT::vector(6, 16); 3354 3355 // Widen to v6s16 and unpack v3 parts. 3356 assert(TargetTy == V3S16); 3357 3358 RemergeParts.push_back(B.buildUndef(V2S16).getReg(0)); 3359 auto Concat = B.buildConcatVectors(V6S16, RemergeParts); 3360 B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat); 3361 } 3362 3363 // FIXME: Just vector trunc should be sufficent, but legalization currently 3364 // broken. 3365 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg, 3366 Register WideDstReg) { 3367 const LLT S32 = LLT::scalar(32); 3368 const LLT S16 = LLT::scalar(16); 3369 3370 auto Unmerge = B.buildUnmerge(S32, WideDstReg); 3371 3372 int NumOps = Unmerge->getNumOperands() - 1; 3373 SmallVector<Register, 4> RemergeParts(NumOps); 3374 for (int I = 0; I != NumOps; ++I) 3375 RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0); 3376 3377 B.buildBuildVector(DstReg, RemergeParts); 3378 } 3379 3380 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3381 MachineInstr &MI, MachineIRBuilder &B, 3382 GISelChangeObserver &Observer, 3383 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3384 bool IsTFE = MI.getNumExplicitDefs() == 2; 3385 3386 // We are only processing the operands of d16 image operations on subtargets 3387 // that use the unpacked register layout, or need to repack the TFE result. 3388 3389 // TODO: Need to handle a16 images too 3390 // TODO: Do we need to guard against already legalized intrinsics? 3391 if (!IsTFE && !ST.hasUnpackedD16VMem()) 3392 return true; 3393 3394 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3395 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3396 3397 if (BaseOpcode->Atomic) // No d16 atomics, or TFE. 3398 return true; 3399 3400 B.setInstr(MI); 3401 3402 MachineRegisterInfo *MRI = B.getMRI(); 3403 const LLT S32 = LLT::scalar(32); 3404 const LLT S16 = LLT::scalar(16); 3405 3406 if (BaseOpcode->Store) { // No TFE for stores? 3407 Register VData = MI.getOperand(1).getReg(); 3408 LLT Ty = MRI->getType(VData); 3409 if (!Ty.isVector() || Ty.getElementType() != S16) 3410 return true; 3411 3412 B.setInstr(MI); 3413 3414 Observer.changingInstr(MI); 3415 MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData)); 3416 Observer.changedInstr(MI); 3417 return true; 3418 } 3419 3420 Register DstReg = MI.getOperand(0).getReg(); 3421 LLT Ty = MRI->getType(DstReg); 3422 const LLT EltTy = Ty.getScalarType(); 3423 const bool IsD16 = Ty.getScalarType() == S16; 3424 const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3425 3426 if (IsTFE) { 3427 // In the IR, TFE is supposed to be used with a 2 element struct return 3428 // type. The intruction really returns these two values in one contiguous 3429 // register, with one additional dword beyond the loaded data. Rewrite the 3430 // return type to use a single register result. 3431 Register Dst1Reg = MI.getOperand(1).getReg(); 3432 if (MRI->getType(Dst1Reg) != S32) 3433 return false; 3434 3435 // TODO: Make sure the TFE operand bit is set. 3436 3437 // The raw dword aligned data component of the load. The only legal cases 3438 // where this matters should be when using the packed D16 format, for 3439 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3440 LLT RoundedTy; 3441 LLT TFETy; 3442 3443 if (IsD16 && ST.hasUnpackedD16VMem()) { 3444 RoundedTy = LLT::scalarOrVector(NumElts, 32); 3445 TFETy = LLT::vector(NumElts + 1, 32); 3446 } else { 3447 unsigned EltSize = Ty.getScalarSizeInBits(); 3448 unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32; 3449 unsigned RoundedSize = 32 * RoundedElts; 3450 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3451 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3452 } 3453 3454 Register TFEReg = MRI->createGenericVirtualRegister(TFETy); 3455 Observer.changingInstr(MI); 3456 3457 MI.getOperand(0).setReg(TFEReg); 3458 MI.RemoveOperand(1); 3459 3460 Observer.changedInstr(MI); 3461 3462 // Insert after the instruction. 3463 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3464 3465 // Now figure out how to copy the new result register back into the old 3466 // result. 3467 3468 SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg); 3469 int NumDataElts = TFETy.getNumElements() - 1; 3470 3471 if (!Ty.isVector()) { 3472 // Simplest case is a trivial unmerge (plus a truncate for d16). 3473 UnmergeResults[0] = Ty == S32 ? 3474 DstReg : MRI->createGenericVirtualRegister(S32); 3475 3476 B.buildUnmerge(UnmergeResults, TFEReg); 3477 if (Ty != S32) 3478 B.buildTrunc(DstReg, UnmergeResults[0]); 3479 return true; 3480 } 3481 3482 // We have to repack into a new vector of some kind. 3483 for (int I = 0; I != NumDataElts; ++I) 3484 UnmergeResults[I] = MRI->createGenericVirtualRegister(S32); 3485 B.buildUnmerge(UnmergeResults, TFEReg); 3486 3487 // Drop the final TFE element. 3488 ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts); 3489 3490 if (EltTy == S32) 3491 B.buildBuildVector(DstReg, DataPart); 3492 else if (ST.hasUnpackedD16VMem()) 3493 truncToS16Vector(B, DstReg, DataPart); 3494 else 3495 bitcastToS16Vector(B, DstReg, DataPart); 3496 3497 return true; 3498 } 3499 3500 // Must be an image load. 3501 if (!Ty.isVector() || Ty.getElementType() != S16) 3502 return true; 3503 3504 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3505 3506 LLT WidenedTy = Ty.changeElementType(S32); 3507 Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy); 3508 3509 Observer.changingInstr(MI); 3510 MI.getOperand(0).setReg(WideDstReg); 3511 Observer.changedInstr(MI); 3512 3513 repackUnpackedD16Load(B, DstReg, WideDstReg); 3514 return true; 3515 } 3516 3517 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 3518 MachineInstr &MI, MachineIRBuilder &B, 3519 GISelChangeObserver &Observer) const { 3520 Register Dst = MI.getOperand(0).getReg(); 3521 LLT Ty = B.getMRI()->getType(Dst); 3522 unsigned Size = Ty.getSizeInBits(); 3523 MachineFunction &MF = B.getMF(); 3524 3525 Observer.changingInstr(MI); 3526 3527 // FIXME: We don't really need this intermediate instruction. The intrinsic 3528 // should be fixed to have a memory operand. Since it's readnone, we're not 3529 // allowed to add one. 3530 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 3531 MI.RemoveOperand(1); // Remove intrinsic ID 3532 3533 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 3534 // TODO: Should this use datalayout alignment? 3535 const unsigned MemSize = (Size + 7) / 8; 3536 const unsigned MemAlign = 4; 3537 MachineMemOperand *MMO = MF.getMachineMemOperand( 3538 MachinePointerInfo(), 3539 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 3540 MachineMemOperand::MOInvariant, MemSize, MemAlign); 3541 MI.addMemOperand(MF, MMO); 3542 3543 // There are no 96-bit result scalar loads, but widening to 128-bit should 3544 // always be legal. We may need to restore this to a 96-bit result if it turns 3545 // out this needs to be converted to a vector load during RegBankSelect. 3546 if (!isPowerOf2_32(Size)) { 3547 LegalizerHelper Helper(MF, *this, Observer, B); 3548 B.setInstr(MI); 3549 3550 if (Ty.isVector()) 3551 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 3552 else 3553 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 3554 } 3555 3556 Observer.changedInstr(MI); 3557 return true; 3558 } 3559 3560 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 3561 MachineIRBuilder &B, 3562 GISelChangeObserver &Observer) const { 3563 MachineRegisterInfo &MRI = *B.getMRI(); 3564 3565 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 3566 auto IntrID = MI.getIntrinsicID(); 3567 switch (IntrID) { 3568 case Intrinsic::amdgcn_if: 3569 case Intrinsic::amdgcn_else: { 3570 MachineInstr *Br = nullptr; 3571 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3572 const SIRegisterInfo *TRI 3573 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3574 3575 B.setInstr(*BrCond); 3576 Register Def = MI.getOperand(1).getReg(); 3577 Register Use = MI.getOperand(3).getReg(); 3578 3579 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 3580 if (Br) 3581 BrTarget = Br->getOperand(0).getMBB(); 3582 3583 if (IntrID == Intrinsic::amdgcn_if) { 3584 B.buildInstr(AMDGPU::SI_IF) 3585 .addDef(Def) 3586 .addUse(Use) 3587 .addMBB(BrTarget); 3588 } else { 3589 B.buildInstr(AMDGPU::SI_ELSE) 3590 .addDef(Def) 3591 .addUse(Use) 3592 .addMBB(BrTarget) 3593 .addImm(0); 3594 } 3595 3596 if (Br) 3597 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 3598 3599 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 3600 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 3601 MI.eraseFromParent(); 3602 BrCond->eraseFromParent(); 3603 return true; 3604 } 3605 3606 return false; 3607 } 3608 case Intrinsic::amdgcn_loop: { 3609 MachineInstr *Br = nullptr; 3610 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3611 const SIRegisterInfo *TRI 3612 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3613 3614 B.setInstr(*BrCond); 3615 3616 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 3617 if (Br) 3618 BrTarget = Br->getOperand(0).getMBB(); 3619 3620 Register Reg = MI.getOperand(2).getReg(); 3621 B.buildInstr(AMDGPU::SI_LOOP) 3622 .addUse(Reg) 3623 .addMBB(BrTarget); 3624 3625 if (Br) 3626 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 3627 3628 MI.eraseFromParent(); 3629 BrCond->eraseFromParent(); 3630 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 3631 return true; 3632 } 3633 3634 return false; 3635 } 3636 case Intrinsic::amdgcn_kernarg_segment_ptr: 3637 return legalizePreloadedArgIntrin( 3638 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3639 case Intrinsic::amdgcn_implicitarg_ptr: 3640 return legalizeImplicitArgPtr(MI, MRI, B); 3641 case Intrinsic::amdgcn_workitem_id_x: 3642 return legalizePreloadedArgIntrin(MI, MRI, B, 3643 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 3644 case Intrinsic::amdgcn_workitem_id_y: 3645 return legalizePreloadedArgIntrin(MI, MRI, B, 3646 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 3647 case Intrinsic::amdgcn_workitem_id_z: 3648 return legalizePreloadedArgIntrin(MI, MRI, B, 3649 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 3650 case Intrinsic::amdgcn_workgroup_id_x: 3651 return legalizePreloadedArgIntrin(MI, MRI, B, 3652 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 3653 case Intrinsic::amdgcn_workgroup_id_y: 3654 return legalizePreloadedArgIntrin(MI, MRI, B, 3655 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 3656 case Intrinsic::amdgcn_workgroup_id_z: 3657 return legalizePreloadedArgIntrin(MI, MRI, B, 3658 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 3659 case Intrinsic::amdgcn_dispatch_ptr: 3660 return legalizePreloadedArgIntrin(MI, MRI, B, 3661 AMDGPUFunctionArgInfo::DISPATCH_PTR); 3662 case Intrinsic::amdgcn_queue_ptr: 3663 return legalizePreloadedArgIntrin(MI, MRI, B, 3664 AMDGPUFunctionArgInfo::QUEUE_PTR); 3665 case Intrinsic::amdgcn_implicit_buffer_ptr: 3666 return legalizePreloadedArgIntrin( 3667 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 3668 case Intrinsic::amdgcn_dispatch_id: 3669 return legalizePreloadedArgIntrin(MI, MRI, B, 3670 AMDGPUFunctionArgInfo::DISPATCH_ID); 3671 case Intrinsic::amdgcn_fdiv_fast: 3672 return legalizeFDIVFastIntrin(MI, MRI, B); 3673 case Intrinsic::amdgcn_is_shared: 3674 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 3675 case Intrinsic::amdgcn_is_private: 3676 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 3677 case Intrinsic::amdgcn_wavefrontsize: { 3678 B.setInstr(MI); 3679 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 3680 MI.eraseFromParent(); 3681 return true; 3682 } 3683 case Intrinsic::amdgcn_s_buffer_load: 3684 return legalizeSBufferLoad(MI, B, Observer); 3685 case Intrinsic::amdgcn_raw_buffer_store: 3686 case Intrinsic::amdgcn_struct_buffer_store: 3687 return legalizeBufferStore(MI, MRI, B, false, false); 3688 case Intrinsic::amdgcn_raw_buffer_store_format: 3689 case Intrinsic::amdgcn_struct_buffer_store_format: 3690 return legalizeBufferStore(MI, MRI, B, false, true); 3691 case Intrinsic::amdgcn_raw_tbuffer_store: 3692 case Intrinsic::amdgcn_struct_tbuffer_store: 3693 return legalizeBufferStore(MI, MRI, B, true, true); 3694 case Intrinsic::amdgcn_raw_buffer_load: 3695 case Intrinsic::amdgcn_struct_buffer_load: 3696 return legalizeBufferLoad(MI, MRI, B, false, false); 3697 case Intrinsic::amdgcn_raw_buffer_load_format: 3698 case Intrinsic::amdgcn_struct_buffer_load_format: 3699 return legalizeBufferLoad(MI, MRI, B, true, false); 3700 case Intrinsic::amdgcn_raw_tbuffer_load: 3701 case Intrinsic::amdgcn_struct_tbuffer_load: 3702 return legalizeBufferLoad(MI, MRI, B, true, true); 3703 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3704 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3705 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3706 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3707 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3708 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3709 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3710 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3711 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3712 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3713 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3714 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3715 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3716 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3717 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3718 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3719 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3720 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3721 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3722 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3723 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3724 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3725 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3726 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3727 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3728 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3729 return legalizeBufferAtomic(MI, B, IntrID); 3730 case Intrinsic::amdgcn_atomic_inc: 3731 return legalizeAtomicIncDec(MI, B, true); 3732 case Intrinsic::amdgcn_atomic_dec: 3733 return legalizeAtomicIncDec(MI, B, false); 3734 default: { 3735 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 3736 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 3737 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 3738 return true; 3739 } 3740 } 3741 3742 return true; 3743 } 3744