//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the Machinelegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

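// Mutation that reduces the element count so each resulting piece is at most
// 64 bits wide: the element type is kept and the element count is divided by
// the number of 64-bit chunks the original type would need.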
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple-of-32-bit
// total size.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() <
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() >
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr =
      GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ?
                      S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder(G_TRUNC)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
    .scalarize(0)
    .alwaysLegal();

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(narrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return IsLoad ?
          512 : 128;
    default:
      return 128;
    }
  };

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(AddrSpace, true))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
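    // Each entry below is {result type, pointer type, memory size in bits,
    // minimum alignment in bits}; the GlobalAlign* values drop to 0 (no
    // alignment requirement) when the subtarget supports unaligned buffer
    // access.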
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                    0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
                }

                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                  0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    // TODO: Support 16-bit shift amounts
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)

      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .lowerFor({{S16, V2S16}})
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f /
                                    numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
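  // Standard round-to-nearest trick: adding and then subtracting 2^52 (with
  // the sign of Src copied onto it) rounds the value to an integer, since
  // doubles of that magnitude have no fraction bits. The compare against the
  // largest double below 2^52 selects the original value when its magnitude
  // is already too large to carry a fractional part.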
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  // The ubfe intrinsic takes (source, offset, width); Hi is the high word of
  // the double, from which the exponent field is extracted.
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));

  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);

  auto Hi = Signed ?
1718 B.buildFPTOSI(S32, FloorMul) : 1719 B.buildFPTOUI(S32, FloorMul); 1720 auto Lo = B.buildFPTOUI(S32, Fma); 1721 1722 B.buildMerge(Dst, { Lo, Hi }); 1723 MI.eraseFromParent(); 1724 1725 return true; 1726 } 1727 1728 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1729 MachineInstr &MI, MachineRegisterInfo &MRI, 1730 MachineIRBuilder &B) const { 1731 MachineFunction &MF = B.getMF(); 1732 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1733 1734 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1735 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1736 1737 // With ieee_mode disabled, the instructions have the correct behavior 1738 // already for G_FMINNUM/G_FMAXNUM 1739 if (!MFI->getMode().IEEE) 1740 return !IsIEEEOp; 1741 1742 if (IsIEEEOp) 1743 return true; 1744 1745 MachineIRBuilder HelperBuilder(MI); 1746 GISelObserverWrapper DummyObserver; 1747 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1748 HelperBuilder.setInstr(MI); 1749 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1750 } 1751 1752 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1753 MachineInstr &MI, MachineRegisterInfo &MRI, 1754 MachineIRBuilder &B) const { 1755 // TODO: Should move some of this into LegalizerHelper. 1756 1757 // TODO: Promote dynamic indexing of s16 to s32 1758 1759 // FIXME: Artifact combiner probably should have replaced the truncated 1760 // constant before this, so we shouldn't need 1761 // getConstantVRegValWithLookThrough. 1762 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1763 MI.getOperand(2).getReg(), MRI); 1764 if (!IdxVal) // Dynamic case will be selected to register indexing. 1765 return true; 1766 1767 Register Dst = MI.getOperand(0).getReg(); 1768 Register Vec = MI.getOperand(1).getReg(); 1769 1770 LLT VecTy = MRI.getType(Vec); 1771 LLT EltTy = VecTy.getElementType(); 1772 assert(EltTy == MRI.getType(Dst)); 1773 1774 B.setInstr(MI); 1775 1776 if (IdxVal->Value < VecTy.getNumElements()) 1777 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1778 else 1779 B.buildUndef(Dst); 1780 1781 MI.eraseFromParent(); 1782 return true; 1783 } 1784 1785 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1786 MachineInstr &MI, MachineRegisterInfo &MRI, 1787 MachineIRBuilder &B) const { 1788 // TODO: Should move some of this into LegalizerHelper. 1789 1790 // TODO: Promote dynamic indexing of s16 to s32 1791 1792 // FIXME: Artifact combiner probably should have replaced the truncated 1793 // constant before this, so we shouldn't need 1794 // getConstantVRegValWithLookThrough. 1795 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1796 MI.getOperand(3).getReg(), MRI); 1797 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1798 return true; 1799 1800 Register Dst = MI.getOperand(0).getReg(); 1801 Register Vec = MI.getOperand(1).getReg(); 1802 Register Ins = MI.getOperand(2).getReg(); 1803 1804 LLT VecTy = MRI.getType(Vec); 1805 LLT EltTy = VecTy.getElementType(); 1806 assert(EltTy == MRI.getType(Ins)); 1807 1808 B.setInstr(MI); 1809 1810 if (IdxVal->Value < VecTy.getNumElements()) 1811 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1812 else 1813 B.buildUndef(Dst); 1814 1815 MI.eraseFromParent(); 1816 return true; 1817 } 1818 1819 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1820 MachineInstr &MI, MachineRegisterInfo &MRI, 1821 MachineIRBuilder &B) const { 1822 const LLT V2S16 = LLT::vector(2, 16); 1823 1824 Register Dst = MI.getOperand(0).getReg(); 1825 Register Src0 = MI.getOperand(1).getReg(); 1826 LLT DstTy = MRI.getType(Dst); 1827 LLT SrcTy = MRI.getType(Src0); 1828 1829 if (SrcTy == V2S16 && DstTy == V2S16 && 1830 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1831 return true; 1832 1833 MachineIRBuilder HelperBuilder(MI); 1834 GISelObserverWrapper DummyObserver; 1835 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1836 HelperBuilder.setInstr(MI); 1837 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1838 } 1839 1840 bool AMDGPULegalizerInfo::legalizeSinCos( 1841 MachineInstr &MI, MachineRegisterInfo &MRI, 1842 MachineIRBuilder &B) const { 1843 B.setInstr(MI); 1844 1845 Register DstReg = MI.getOperand(0).getReg(); 1846 Register SrcReg = MI.getOperand(1).getReg(); 1847 LLT Ty = MRI.getType(DstReg); 1848 unsigned Flags = MI.getFlags(); 1849 1850 Register TrigVal; 1851 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1852 if (ST.hasTrigReducedRange()) { 1853 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1854 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1855 .addUse(MulVal.getReg(0)) 1856 .setMIFlags(Flags).getReg(0); 1857 } else 1858 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1859 1860 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1861 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1862 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1863 .addUse(TrigVal) 1864 .setMIFlags(Flags); 1865 MI.eraseFromParent(); 1866 return true; 1867 } 1868 1869 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1870 Register DstReg, LLT PtrTy, 1871 MachineIRBuilder &B, const GlobalValue *GV, 1872 unsigned Offset, unsigned GAFlags) const { 1873 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1874 // to the following code sequence: 1875 // 1876 // For constant address space: 1877 // s_getpc_b64 s[0:1] 1878 // s_add_u32 s0, s0, $symbol 1879 // s_addc_u32 s1, s1, 0 1880 // 1881 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1882 // a fixup or relocation is emitted to replace $symbol with a literal 1883 // constant, which is a pc-relative offset from the encoding of the $symbol 1884 // operand to the global variable. 
1885 // 1886 // For global address space: 1887 // s_getpc_b64 s[0:1] 1888 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1889 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1890 // 1891 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1892 // fixups or relocations are emitted to replace $symbol@*@lo and 1893 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1894 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1895 // operand to the global variable. 1896 // 1897 // What we want here is an offset from the value returned by s_getpc 1898 // (which is the address of the s_add_u32 instruction) to the global 1899 // variable, but since the encoding of $symbol starts 4 bytes after the start 1900 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1901 // small. This requires us to add 4 to the global variable offset in order to 1902 // compute the correct address. 1903 1904 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1905 1906 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1907 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1908 1909 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1910 .addDef(PCReg); 1911 1912 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1913 if (GAFlags == SIInstrInfo::MO_NONE) 1914 MIB.addImm(0); 1915 else 1916 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1917 1918 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1919 1920 if (PtrTy.getSizeInBits() == 32) 1921 B.buildExtract(DstReg, PCReg, 0); 1922 return true; 1923 } 1924 1925 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1926 MachineInstr &MI, MachineRegisterInfo &MRI, 1927 MachineIRBuilder &B) const { 1928 Register DstReg = MI.getOperand(0).getReg(); 1929 LLT Ty = MRI.getType(DstReg); 1930 unsigned AS = Ty.getAddressSpace(); 1931 1932 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1933 MachineFunction &MF = B.getMF(); 1934 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1935 B.setInstr(MI); 1936 1937 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1938 if (!MFI->isEntryFunction()) { 1939 const Function &Fn = MF.getFunction(); 1940 DiagnosticInfoUnsupported BadLDSDecl( 1941 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1942 Fn.getContext().diagnose(BadLDSDecl); 1943 } 1944 1945 // TODO: We could emit code to handle the initialization somewhere. 
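// LDS cannot be initialized from the program image, so only globals without a
// meaningful initializer are handled here by materializing the allocated LDS
// offset; anything else is diagnosed as unsupported below.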
1946 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1947 const SITargetLowering *TLI = ST.getTargetLowering(); 1948 if (!TLI->shouldUseLDSConstAddress(GV)) { 1949 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 1950 return true; // Leave in place; 1951 } 1952 1953 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 1954 MI.eraseFromParent(); 1955 return true; 1956 } 1957 1958 const Function &Fn = MF.getFunction(); 1959 DiagnosticInfoUnsupported BadInit( 1960 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 1961 Fn.getContext().diagnose(BadInit); 1962 return true; 1963 } 1964 1965 const SITargetLowering *TLI = ST.getTargetLowering(); 1966 1967 if (TLI->shouldEmitFixup(GV)) { 1968 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 1969 MI.eraseFromParent(); 1970 return true; 1971 } 1972 1973 if (TLI->shouldEmitPCReloc(GV)) { 1974 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 1975 MI.eraseFromParent(); 1976 return true; 1977 } 1978 1979 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1980 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 1981 1982 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 1983 MachinePointerInfo::getGOT(MF), 1984 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1985 MachineMemOperand::MOInvariant, 1986 8 /*Size*/, 8 /*Align*/); 1987 1988 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 1989 1990 if (Ty.getSizeInBits() == 32) { 1991 // Truncate if this is a 32-bit constant adrdess. 1992 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 1993 B.buildExtract(DstReg, Load, 0); 1994 } else 1995 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 1996 1997 MI.eraseFromParent(); 1998 return true; 1999 } 2000 2001 bool AMDGPULegalizerInfo::legalizeLoad( 2002 MachineInstr &MI, MachineRegisterInfo &MRI, 2003 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2004 B.setInstr(MI); 2005 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2006 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2007 Observer.changingInstr(MI); 2008 MI.getOperand(1).setReg(Cast.getReg(0)); 2009 Observer.changedInstr(MI); 2010 return true; 2011 } 2012 2013 bool AMDGPULegalizerInfo::legalizeFMad( 2014 MachineInstr &MI, MachineRegisterInfo &MRI, 2015 MachineIRBuilder &B) const { 2016 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2017 assert(Ty.isScalar()); 2018 2019 MachineFunction &MF = B.getMF(); 2020 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2021 2022 // TODO: Always legal with future ftz flag. 2023 // FIXME: Do we need just output? 
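// Keep G_FMAD only when denormals are flushed for the type (the underlying mad
// instructions are assumed not to handle denormals); otherwise fall back to the
// generic unfused mul + add lowering below.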
2024 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2025 return true; 2026 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2027 return true; 2028 2029 MachineIRBuilder HelperBuilder(MI); 2030 GISelObserverWrapper DummyObserver; 2031 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2032 HelperBuilder.setMBB(*MI.getParent()); 2033 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2034 } 2035 2036 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2037 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2038 Register DstReg = MI.getOperand(0).getReg(); 2039 Register PtrReg = MI.getOperand(1).getReg(); 2040 Register CmpVal = MI.getOperand(2).getReg(); 2041 Register NewVal = MI.getOperand(3).getReg(); 2042 2043 assert(SITargetLowering::isFlatGlobalAddrSpace( 2044 MRI.getType(PtrReg).getAddressSpace()) && 2045 "this should not have been custom lowered"); 2046 2047 LLT ValTy = MRI.getType(CmpVal); 2048 LLT VecTy = LLT::vector(2, ValTy); 2049 2050 B.setInstr(MI); 2051 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2052 2053 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2054 .addDef(DstReg) 2055 .addUse(PtrReg) 2056 .addUse(PackedVal) 2057 .setMemRefs(MI.memoperands()); 2058 2059 MI.eraseFromParent(); 2060 return true; 2061 } 2062 2063 bool AMDGPULegalizerInfo::legalizeFlog( 2064 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2065 Register Dst = MI.getOperand(0).getReg(); 2066 Register Src = MI.getOperand(1).getReg(); 2067 LLT Ty = B.getMRI()->getType(Dst); 2068 unsigned Flags = MI.getFlags(); 2069 B.setInstr(MI); 2070 2071 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2072 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2073 2074 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2075 MI.eraseFromParent(); 2076 return true; 2077 } 2078 2079 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2080 MachineIRBuilder &B) const { 2081 Register Dst = MI.getOperand(0).getReg(); 2082 Register Src = MI.getOperand(1).getReg(); 2083 unsigned Flags = MI.getFlags(); 2084 LLT Ty = B.getMRI()->getType(Dst); 2085 B.setInstr(MI); 2086 2087 auto K = B.buildFConstant(Ty, numbers::log2e); 2088 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2089 B.buildFExp2(Dst, Mul, Flags); 2090 MI.eraseFromParent(); 2091 return true; 2092 } 2093 2094 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2095 MachineIRBuilder &B) const { 2096 Register Dst = MI.getOperand(0).getReg(); 2097 Register Src0 = MI.getOperand(1).getReg(); 2098 Register Src1 = MI.getOperand(2).getReg(); 2099 unsigned Flags = MI.getFlags(); 2100 LLT Ty = B.getMRI()->getType(Dst); 2101 B.setInstr(MI); 2102 const LLT S16 = LLT::scalar(16); 2103 const LLT S32 = LLT::scalar(32); 2104 2105 if (Ty == S32) { 2106 auto Log = B.buildFLog2(S32, Src0, Flags); 2107 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2108 .addUse(Log.getReg(0)) 2109 .addUse(Src1) 2110 .setMIFlags(Flags); 2111 B.buildFExp2(Dst, Mul, Flags); 2112 } else if (Ty == S16) { 2113 // There's no f16 fmul_legacy, so we need to convert for it. 
2114 auto Log = B.buildFLog2(S16, Src0, Flags); 2115 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2116 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2117 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2118 .addUse(Ext0.getReg(0)) 2119 .addUse(Ext1.getReg(0)) 2120 .setMIFlags(Flags); 2121 2122 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2123 } else 2124 return false; 2125 2126 MI.eraseFromParent(); 2127 return true; 2128 } 2129 2130 // Find a source register, ignoring any possible source modifiers. 2131 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2132 Register ModSrc = OrigSrc; 2133 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2134 ModSrc = SrcFNeg->getOperand(1).getReg(); 2135 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2136 ModSrc = SrcFAbs->getOperand(1).getReg(); 2137 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2138 ModSrc = SrcFAbs->getOperand(1).getReg(); 2139 return ModSrc; 2140 } 2141 2142 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2143 MachineRegisterInfo &MRI, 2144 MachineIRBuilder &B) const { 2145 B.setInstr(MI); 2146 2147 const LLT S1 = LLT::scalar(1); 2148 const LLT S64 = LLT::scalar(64); 2149 Register Dst = MI.getOperand(0).getReg(); 2150 Register OrigSrc = MI.getOperand(1).getReg(); 2151 unsigned Flags = MI.getFlags(); 2152 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2153 "this should not have been custom lowered"); 2154 2155 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2156 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2157 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2158 // V_FRACT bug is: 2159 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2160 // 2161 // Convert floor(x) to (x - fract(x)) 2162 2163 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2164 .addUse(OrigSrc) 2165 .setMIFlags(Flags); 2166 2167 // Give source modifier matching some assistance before obscuring a foldable 2168 // pattern. 2169 2170 // TODO: We can avoid the neg on the fract? The input sign to fract 2171 // shouldn't matter? 2172 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2173 2174 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2175 2176 Register Min = MRI.createGenericVirtualRegister(S64); 2177 2178 // We don't need to concern ourselves with the snan handling difference, so 2179 // use the one which will directly select. 2180 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2181 if (MFI->getMode().IEEE) 2182 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2183 else 2184 B.buildFMinNum(Min, Fract, Const, Flags); 2185 2186 Register CorrectedFract = Min; 2187 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2188 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2189 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2190 } 2191 2192 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2193 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2194 2195 MI.eraseFromParent(); 2196 return true; 2197 } 2198 2199 // Turn an illegal packed v2s16 build vector into bit operations. 2200 // TODO: This should probably be a bitcast action in LegalizerHelper. 
2201 bool AMDGPULegalizerInfo::legalizeBuildVector( 2202 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2203 Register Dst = MI.getOperand(0).getReg(); 2204 LLT DstTy = MRI.getType(Dst); 2205 const LLT S32 = LLT::scalar(32); 2206 const LLT V2S16 = LLT::vector(2, 16); 2207 (void)DstTy; 2208 (void)V2S16; 2209 assert(DstTy == V2S16); 2210 2211 Register Src0 = MI.getOperand(1).getReg(); 2212 Register Src1 = MI.getOperand(2).getReg(); 2213 assert(MRI.getType(Src0) == LLT::scalar(16)); 2214 2215 B.setInstr(MI); 2216 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2217 B.buildBitcast(Dst, Merge); 2218 2219 MI.eraseFromParent(); 2220 return true; 2221 } 2222 2223 // Return the use branch instruction, otherwise null if the usage is invalid. 2224 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2225 MachineRegisterInfo &MRI, 2226 MachineInstr *&Br) { 2227 Register CondDef = MI.getOperand(0).getReg(); 2228 if (!MRI.hasOneNonDBGUse(CondDef)) 2229 return nullptr; 2230 2231 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2232 if (UseMI.getParent() != MI.getParent() || 2233 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2234 return nullptr; 2235 2236 // Make sure the cond br is followed by a G_BR 2237 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2238 if (Next != MI.getParent()->end()) { 2239 if (Next->getOpcode() != AMDGPU::G_BR) 2240 return nullptr; 2241 Br = &*Next; 2242 } 2243 2244 return &UseMI; 2245 } 2246 2247 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, 2248 Register Reg, LLT Ty) const { 2249 Register LiveIn = MRI.getLiveInVirtReg(Reg); 2250 if (LiveIn) 2251 return LiveIn; 2252 2253 Register NewReg = MRI.createGenericVirtualRegister(Ty); 2254 MRI.addLiveIn(Reg, NewReg); 2255 return NewReg; 2256 } 2257 2258 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2259 const ArgDescriptor *Arg) const { 2260 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2261 return false; // TODO: Handle these 2262 2263 assert(Arg->getRegister().isPhysical()); 2264 2265 MachineRegisterInfo &MRI = *B.getMRI(); 2266 2267 LLT Ty = MRI.getType(DstReg); 2268 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); 2269 2270 if (Arg->isMasked()) { 2271 // TODO: Should we try to emit this once in the entry block? 2272 const LLT S32 = LLT::scalar(32); 2273 const unsigned Mask = Arg->getMask(); 2274 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2275 2276 Register AndMaskSrc = LiveIn; 2277 2278 if (Shift != 0) { 2279 auto ShiftAmt = B.buildConstant(S32, Shift); 2280 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2281 } 2282 2283 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2284 } else 2285 B.buildCopy(DstReg, LiveIn); 2286 2287 // Insert the argument copy if it doens't already exist. 2288 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
2289 if (!MRI.getVRegDef(LiveIn)) { 2290 // FIXME: Should have scoped insert pt 2291 MachineBasicBlock &OrigInsBB = B.getMBB(); 2292 auto OrigInsPt = B.getInsertPt(); 2293 2294 MachineBasicBlock &EntryMBB = B.getMF().front(); 2295 EntryMBB.addLiveIn(Arg->getRegister()); 2296 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2297 B.buildCopy(LiveIn, Arg->getRegister()); 2298 2299 B.setInsertPt(OrigInsBB, OrigInsPt); 2300 } 2301 2302 return true; 2303 } 2304 2305 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2306 MachineInstr &MI, 2307 MachineRegisterInfo &MRI, 2308 MachineIRBuilder &B, 2309 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2310 B.setInstr(MI); 2311 2312 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2313 2314 const ArgDescriptor *Arg; 2315 const TargetRegisterClass *RC; 2316 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2317 if (!Arg) { 2318 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2319 return false; 2320 } 2321 2322 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 2323 MI.eraseFromParent(); 2324 return true; 2325 } 2326 2327 return false; 2328 } 2329 2330 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2331 MachineRegisterInfo &MRI, 2332 MachineIRBuilder &B) const { 2333 B.setInstr(MI); 2334 Register Dst = MI.getOperand(0).getReg(); 2335 LLT DstTy = MRI.getType(Dst); 2336 LLT S16 = LLT::scalar(16); 2337 LLT S32 = LLT::scalar(32); 2338 LLT S64 = LLT::scalar(64); 2339 2340 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2341 return true; 2342 2343 if (DstTy == S16) 2344 return legalizeFDIV16(MI, MRI, B); 2345 if (DstTy == S32) 2346 return legalizeFDIV32(MI, MRI, B); 2347 if (DstTy == S64) 2348 return legalizeFDIV64(MI, MRI, B); 2349 2350 return false; 2351 } 2352 2353 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2354 const LLT S32 = LLT::scalar(32); 2355 2356 auto Cvt0 = B.buildUITOFP(S32, Src); 2357 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2358 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2359 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2360 return B.buildFPTOUI(S32, Mul).getReg(0); 2361 } 2362 2363 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2364 Register DstReg, 2365 Register Num, 2366 Register Den, 2367 bool IsRem) const { 2368 const LLT S1 = LLT::scalar(1); 2369 const LLT S32 = LLT::scalar(32); 2370 2371 // RCP = URECIP(Den) = 2^32 / Den + e 2372 // e is rounding error. 2373 auto RCP = buildDivRCP(B, Den); 2374 2375 // RCP_LO = mul(RCP, Den) 2376 auto RCP_LO = B.buildMul(S32, RCP, Den); 2377 2378 // RCP_HI = mulhu (RCP, Den) */ 2379 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2380 2381 // NEG_RCP_LO = -RCP_LO 2382 auto Zero = B.buildConstant(S32, 0); 2383 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2384 2385 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2386 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2387 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2388 2389 // Calculate the rounding error from the URECIP instruction 2390 // E = mulhu(ABS_RCP_LO, RCP) 2391 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2392 2393 // RCP_A_E = RCP + E 2394 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2395 2396 // RCP_S_E = RCP - E 2397 auto RCP_S_E = B.buildSub(S32, RCP, E); 2398 2399 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_S_E) 2400 auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E); 2401 2402 // Quotient = mulhu(Tmp0, Num) 2403 auto Quotient = B.buildUMulH(S32, Tmp0, Num); 2404 2405 // Num_S_Remainder = Quotient * Den 2406 auto Num_S_Remainder = B.buildMul(S32, Quotient, Den); 2407 2408 // Remainder = Num - Num_S_Remainder 2409 auto Remainder = B.buildSub(S32, Num, Num_S_Remainder); 2410 2411 // Remainder_GE_Den = Remainder >= Den 2412 auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den); 2413 2414 // Remainder_GE_Zero = Num >= Num_S_Remainder 2415 auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1, 2416 Num, Num_S_Remainder); 2417 2418 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero 2419 auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero); 2420 2421 // Calculate Division result: 2422 2423 // Quotient_A_One = Quotient + 1 2424 auto One = B.buildConstant(S32, 1); 2425 auto Quotient_A_One = B.buildAdd(S32, Quotient, One); 2426 2427 // Quotient_S_One = Quotient - 1 2428 auto Quotient_S_One = B.buildSub(S32, Quotient, One); 2429 2430 // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient) 2431 auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One); 2432 2433 // Div = (Remainder_GE_Zero ? Div : Quotient_S_One) 2434 if (IsRem) { 2435 Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One); 2436 2437 // Calculate Rem result: 2438 auto Remainder_S_Den = B.buildSub(S32, Remainder, Den); 2439 2440 // Remainder_A_Den = Remainder + Den 2441 auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den); 2442 2443 // Rem = (Tmp1 ? Remainder_S_Den : Remainder) 2444 auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder); 2445 2446 // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den) 2447 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den); 2448 } else { 2449 B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One); 2450 } 2451 } 2452 2453 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2454 MachineRegisterInfo &MRI, 2455 MachineIRBuilder &B) const { 2456 B.setInstr(MI); 2457 const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM; 2458 Register DstReg = MI.getOperand(0).getReg(); 2459 Register Num = MI.getOperand(1).getReg(); 2460 Register Den = MI.getOperand(2).getReg(); 2461 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem); 2462 MI.eraseFromParent(); 2463 return true; 2464 } 2465 2466 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2467 MachineRegisterInfo &MRI, 2468 MachineIRBuilder &B) const { 2469 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2470 return legalizeUDIV_UREM32(MI, MRI, B); 2471 return false; 2472 } 2473 2474 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI, 2475 MachineRegisterInfo &MRI, 2476 MachineIRBuilder &B) const { 2477 B.setInstr(MI); 2478 const LLT S32 = LLT::scalar(32); 2479 2480 const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM; 2481 Register DstReg = MI.getOperand(0).getReg(); 2482 Register LHS = MI.getOperand(1).getReg(); 2483 Register RHS = MI.getOperand(2).getReg(); 2484 2485 auto ThirtyOne = B.buildConstant(S32, 31); 2486 auto LHSign = B.buildAShr(S32, LHS, ThirtyOne); 2487 auto RHSign = B.buildAShr(S32, RHS, ThirtyOne); 2488 2489 LHS = B.buildAdd(S32, LHS, LHSign).getReg(0); 2490 RHS = B.buildAdd(S32, RHS, RHSign).getReg(0); 2491 2492 LHS = B.buildXor(S32, LHS, LHSign).getReg(0); 2493 RHS = B.buildXor(S32, RHS, RHSign).getReg(0); 2494 2495 Register UDivRem = MRI.createGenericVirtualRegister(S32); 2496 legalizeUDIV_UREM32Impl(B,
UDivRem, LHS, RHS, IsRem); 2497 2498 if (IsRem) { 2499 auto RSign = LHSign; // Remainder sign is the same as LHS 2500 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0); 2501 B.buildSub(DstReg, UDivRem, RSign); 2502 } else { 2503 auto DSign = B.buildXor(S32, LHSign, RHSign); 2504 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0); 2505 B.buildSub(DstReg, UDivRem, DSign); 2506 } 2507 2508 MI.eraseFromParent(); 2509 return true; 2510 } 2511 2512 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2513 MachineRegisterInfo &MRI, 2514 MachineIRBuilder &B) const { 2515 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2516 return legalizeSDIV_SREM32(MI, MRI, B); 2517 return false; 2518 } 2519 2520 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2521 MachineRegisterInfo &MRI, 2522 MachineIRBuilder &B) const { 2523 Register Res = MI.getOperand(0).getReg(); 2524 Register LHS = MI.getOperand(1).getReg(); 2525 Register RHS = MI.getOperand(2).getReg(); 2526 2527 uint16_t Flags = MI.getFlags(); 2528 2529 LLT ResTy = MRI.getType(Res); 2530 LLT S32 = LLT::scalar(32); 2531 LLT S64 = LLT::scalar(64); 2532 2533 const MachineFunction &MF = B.getMF(); 2534 bool Unsafe = 2535 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2536 2537 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2538 return false; 2539 2540 if (!Unsafe && ResTy == S32 && 2541 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2542 return false; 2543 2544 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2545 // 1 / x -> RCP(x) 2546 if (CLHS->isExactlyValue(1.0)) { 2547 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2548 .addUse(RHS) 2549 .setMIFlags(Flags); 2550 2551 MI.eraseFromParent(); 2552 return true; 2553 } 2554 2555 // -1 / x -> RCP( FNEG(x) ) 2556 if (CLHS->isExactlyValue(-1.0)) { 2557 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2558 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2559 .addUse(FNeg.getReg(0)) 2560 .setMIFlags(Flags); 2561 2562 MI.eraseFromParent(); 2563 return true; 2564 } 2565 } 2566 2567 // x / y -> x * (1.0 / y) 2568 if (Unsafe) { 2569 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2570 .addUse(RHS) 2571 .setMIFlags(Flags); 2572 B.buildFMul(Res, LHS, RCP, Flags); 2573 2574 MI.eraseFromParent(); 2575 return true; 2576 } 2577 2578 return false; 2579 } 2580 2581 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2582 MachineRegisterInfo &MRI, 2583 MachineIRBuilder &B) const { 2584 B.setInstr(MI); 2585 Register Res = MI.getOperand(0).getReg(); 2586 Register LHS = MI.getOperand(1).getReg(); 2587 Register RHS = MI.getOperand(2).getReg(); 2588 2589 uint16_t Flags = MI.getFlags(); 2590 2591 LLT S16 = LLT::scalar(16); 2592 LLT S32 = LLT::scalar(32); 2593 2594 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2595 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2596 2597 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2598 .addUse(RHSExt.getReg(0)) 2599 .setMIFlags(Flags); 2600 2601 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2602 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2603 2604 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2605 .addUse(RDst.getReg(0)) 2606 .addUse(RHS) 2607 .addUse(LHS) 2608 .setMIFlags(Flags); 2609 2610 MI.eraseFromParent(); 2611 return true; 2612 } 2613 2614 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2615 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 
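// On subtargets with S_DENORM_MODE this is a single immediate mode update;
// otherwise the FP_DENORM bit field of the MODE register is rewritten with
// s_setreg.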
2616 static void toggleSPDenormMode(bool Enable, 2617 MachineIRBuilder &B, 2618 const GCNSubtarget &ST, 2619 AMDGPU::SIModeRegisterDefaults Mode) { 2620 // Set SP denorm mode to this value. 2621 unsigned SPDenormMode = 2622 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2623 2624 if (ST.hasDenormModeInst()) { 2625 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2626 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2627 2628 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2629 B.buildInstr(AMDGPU::S_DENORM_MODE) 2630 .addImm(NewDenormModeValue); 2631 2632 } else { 2633 // Select FP32 bit field in mode register. 2634 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2635 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2636 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2637 2638 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2639 .addImm(SPDenormMode) 2640 .addImm(SPDenormModeBitField); 2641 } 2642 } 2643 2644 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2645 MachineRegisterInfo &MRI, 2646 MachineIRBuilder &B) const { 2647 B.setInstr(MI); 2648 Register Res = MI.getOperand(0).getReg(); 2649 Register LHS = MI.getOperand(1).getReg(); 2650 Register RHS = MI.getOperand(2).getReg(); 2651 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2652 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2653 2654 uint16_t Flags = MI.getFlags(); 2655 2656 LLT S32 = LLT::scalar(32); 2657 LLT S1 = LLT::scalar(1); 2658 2659 auto One = B.buildFConstant(S32, 1.0f); 2660 2661 auto DenominatorScaled = 2662 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2663 .addUse(RHS) 2664 .addUse(LHS) 2665 .addImm(1) 2666 .setMIFlags(Flags); 2667 auto NumeratorScaled = 2668 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2669 .addUse(LHS) 2670 .addUse(RHS) 2671 .addImm(0) 2672 .setMIFlags(Flags); 2673 2674 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2675 .addUse(DenominatorScaled.getReg(0)) 2676 .setMIFlags(Flags); 2677 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2678 2679 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2680 // aren't modeled as reading it. 
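// Roughly: Fma0/Fma1 refine the reciprocal estimate of the scaled denominator,
// Mul forms the initial quotient, and Fma2..Fma4 compute the correction terms
// that div_fmas folds into the final scaled result.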
2681 if (!Mode.allFP32Denormals()) 2682 toggleSPDenormMode(true, B, ST, Mode); 2683 2684 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2685 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2686 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2687 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2688 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2689 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2690 2691 if (!Mode.allFP32Denormals()) 2692 toggleSPDenormMode(false, B, ST, Mode); 2693 2694 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2695 .addUse(Fma4.getReg(0)) 2696 .addUse(Fma1.getReg(0)) 2697 .addUse(Fma3.getReg(0)) 2698 .addUse(NumeratorScaled.getReg(1)) 2699 .setMIFlags(Flags); 2700 2701 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2702 .addUse(Fmas.getReg(0)) 2703 .addUse(RHS) 2704 .addUse(LHS) 2705 .setMIFlags(Flags); 2706 2707 MI.eraseFromParent(); 2708 return true; 2709 } 2710 2711 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2712 MachineRegisterInfo &MRI, 2713 MachineIRBuilder &B) const { 2714 B.setInstr(MI); 2715 Register Res = MI.getOperand(0).getReg(); 2716 Register LHS = MI.getOperand(1).getReg(); 2717 Register RHS = MI.getOperand(2).getReg(); 2718 2719 uint16_t Flags = MI.getFlags(); 2720 2721 LLT S64 = LLT::scalar(64); 2722 LLT S1 = LLT::scalar(1); 2723 2724 auto One = B.buildFConstant(S64, 1.0); 2725 2726 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2727 .addUse(LHS) 2728 .addUse(RHS) 2729 .addImm(1) 2730 .setMIFlags(Flags); 2731 2732 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2733 2734 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2735 .addUse(DivScale0.getReg(0)) 2736 .setMIFlags(Flags); 2737 2738 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 2739 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 2740 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 2741 2742 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2743 .addUse(LHS) 2744 .addUse(RHS) 2745 .addImm(0) 2746 .setMIFlags(Flags); 2747 2748 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 2749 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 2750 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 2751 2752 Register Scale; 2753 if (!ST.hasUsableDivScaleConditionOutput()) { 2754 // Workaround a hardware bug on SI where the condition output from div_scale 2755 // is not usable.
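// Instead, recover the scale flag by comparing the high halves of the original
// operands with the high halves of the div_scale results and xor'ing the two
// comparisons.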
2756 2757 LLT S32 = LLT::scalar(32); 2758 2759 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2760 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2761 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2762 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2763 2764 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2765 Scale1Unmerge.getReg(1)); 2766 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2767 Scale0Unmerge.getReg(1)); 2768 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2769 } else { 2770 Scale = DivScale1.getReg(1); 2771 } 2772 2773 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2774 .addUse(Fma4.getReg(0)) 2775 .addUse(Fma3.getReg(0)) 2776 .addUse(Mul.getReg(0)) 2777 .addUse(Scale) 2778 .setMIFlags(Flags); 2779 2780 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2781 .addUse(Fmas.getReg(0)) 2782 .addUse(RHS) 2783 .addUse(LHS) 2784 .setMIFlags(Flags); 2785 2786 MI.eraseFromParent(); 2787 return true; 2788 } 2789 2790 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2791 MachineRegisterInfo &MRI, 2792 MachineIRBuilder &B) const { 2793 B.setInstr(MI); 2794 Register Res = MI.getOperand(0).getReg(); 2795 Register LHS = MI.getOperand(2).getReg(); 2796 Register RHS = MI.getOperand(3).getReg(); 2797 uint16_t Flags = MI.getFlags(); 2798 2799 LLT S32 = LLT::scalar(32); 2800 LLT S1 = LLT::scalar(1); 2801 2802 auto Abs = B.buildFAbs(S32, RHS, Flags); 2803 const APFloat C0Val(1.0f); 2804 2805 auto C0 = B.buildConstant(S32, 0x6f800000); 2806 auto C1 = B.buildConstant(S32, 0x2f800000); 2807 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2808 2809 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2810 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2811 2812 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2813 2814 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2815 .addUse(Mul0.getReg(0)) 2816 .setMIFlags(Flags); 2817 2818 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2819 2820 B.buildFMul(Res, Sel, Mul1, Flags); 2821 2822 MI.eraseFromParent(); 2823 return true; 2824 } 2825 2826 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2827 MachineRegisterInfo &MRI, 2828 MachineIRBuilder &B) const { 2829 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2830 if (!MFI->isEntryFunction()) { 2831 return legalizePreloadedArgIntrin(MI, MRI, B, 2832 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2833 } 2834 2835 B.setInstr(MI); 2836 2837 uint64_t Offset = 2838 ST.getTargetLowering()->getImplicitParameterOffset( 2839 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2840 Register DstReg = MI.getOperand(0).getReg(); 2841 LLT DstTy = MRI.getType(DstReg); 2842 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2843 2844 const ArgDescriptor *Arg; 2845 const TargetRegisterClass *RC; 2846 std::tie(Arg, RC) 2847 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2848 if (!Arg) 2849 return false; 2850 2851 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2852 if (!loadInputValue(KernargPtrReg, B, Arg)) 2853 return false; 2854 2855 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2856 MI.eraseFromParent(); 2857 return true; 2858 } 2859 2860 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2861 MachineRegisterInfo &MRI, 2862 MachineIRBuilder &B, 2863 unsigned AddrSpace) const { 2864 B.setInstr(MI); 2865 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2866 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2867 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2868 MI.eraseFromParent(); 2869 return true; 2870 } 2871 2872 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2873 // offset (the offset that is included in bounds checking and swizzling, to be 2874 // split between the instruction's voffset and immoffset fields) and soffset 2875 // (the offset that is excluded from bounds checking and swizzling, to go in 2876 // the instruction's soffset field). This function takes the first kind of 2877 // offset and figures out how to split it between voffset and immoffset. 2878 std::tuple<Register, unsigned, unsigned> 2879 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2880 Register OrigOffset) const { 2881 const unsigned MaxImm = 4095; 2882 Register BaseReg; 2883 unsigned TotalConstOffset; 2884 MachineInstr *OffsetDef; 2885 const LLT S32 = LLT::scalar(32); 2886 2887 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2888 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2889 2890 unsigned ImmOffset = TotalConstOffset; 2891 2892 // If the immediate value is too big for the immoffset field, put the value 2893 // and -4096 into the immoffset field so that the value that is copied/added 2894 // for the voffset field is a multiple of 4096, and it stands more chance 2895 // of being CSEd with the copy/add for another similar load/store. 2896 // However, do not do that rounding down to a multiple of 4096 if that is a 2897 // negative number, as it appears to be illegal to have a negative offset 2898 // in the vgpr, even if adding the immediate offset makes it positive. 2899 unsigned Overflow = ImmOffset & ~MaxImm; 2900 ImmOffset -= Overflow; 2901 if ((int32_t)Overflow < 0) { 2902 Overflow += ImmOffset; 2903 ImmOffset = 0; 2904 } 2905 2906 if (Overflow != 0) { 2907 if (!BaseReg) { 2908 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2909 } else { 2910 auto OverflowVal = B.buildConstant(S32, Overflow); 2911 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2912 } 2913 } 2914 2915 if (!BaseReg) 2916 BaseReg = B.buildConstant(S32, 0).getReg(0); 2917 2918 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2919 } 2920 2921 /// Handle register layout difference for f16 images for some subtargets. 2922 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2923 MachineRegisterInfo &MRI, 2924 Register Reg) const { 2925 if (!ST.hasUnpackedD16VMem()) 2926 return Reg; 2927 2928 const LLT S16 = LLT::scalar(16); 2929 const LLT S32 = LLT::scalar(32); 2930 LLT StoreVT = MRI.getType(Reg); 2931 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2932 2933 auto Unmerge = B.buildUnmerge(S16, Reg); 2934 2935 SmallVector<Register, 4> WideRegs; 2936 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2937 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2938 2939 int NumElts = StoreVT.getNumElements(); 2940 2941 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2942 } 2943 2944 Register AMDGPULegalizerInfo::fixStoreSourceType( 2945 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2946 MachineRegisterInfo *MRI = B.getMRI(); 2947 LLT Ty = MRI->getType(VData); 2948 2949 const LLT S16 = LLT::scalar(16); 2950 2951 // Fixup illegal register types for i8 stores. 
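// 8 and 16-bit scalar sources are any-extended to 32 bits here; the store
// opcode selected later still writes only the original memory size.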
2952 if (Ty == LLT::scalar(8) || Ty == S16) { 2953 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2954 return AnyExt; 2955 } 2956 2957 if (Ty.isVector()) { 2958 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2959 if (IsFormat) 2960 return handleD16VData(B, *MRI, VData); 2961 } 2962 } 2963 2964 return VData; 2965 } 2966 2967 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2968 MachineRegisterInfo &MRI, 2969 MachineIRBuilder &B, 2970 bool IsTyped, 2971 bool IsFormat) const { 2972 B.setInstr(MI); 2973 2974 Register VData = MI.getOperand(1).getReg(); 2975 LLT Ty = MRI.getType(VData); 2976 LLT EltTy = Ty.getScalarType(); 2977 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2978 const LLT S32 = LLT::scalar(32); 2979 2980 VData = fixStoreSourceType(B, VData, IsFormat); 2981 Register RSrc = MI.getOperand(2).getReg(); 2982 2983 MachineMemOperand *MMO = *MI.memoperands_begin(); 2984 const int MemSize = MMO->getSize(); 2985 2986 unsigned ImmOffset; 2987 unsigned TotalOffset; 2988 2989 // The typed intrinsics add an immediate after the registers. 2990 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2991 2992 // The struct intrinsic variants add one additional operand over raw. 2993 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2994 Register VIndex; 2995 int OpOffset = 0; 2996 if (HasVIndex) { 2997 VIndex = MI.getOperand(3).getReg(); 2998 OpOffset = 1; 2999 } 3000 3001 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3002 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3003 3004 unsigned Format = 0; 3005 if (IsTyped) { 3006 Format = MI.getOperand(5 + OpOffset).getImm(); 3007 ++OpOffset; 3008 } 3009 3010 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3011 3012 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3013 if (TotalOffset != 0) 3014 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3015 3016 unsigned Opc; 3017 if (IsTyped) { 3018 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3019 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3020 } else if (IsFormat) { 3021 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3022 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3023 } else { 3024 switch (MemSize) { 3025 case 1: 3026 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3027 break; 3028 case 2: 3029 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3030 break; 3031 default: 3032 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3033 break; 3034 } 3035 } 3036 3037 if (!VIndex) 3038 VIndex = B.buildConstant(S32, 0).getReg(0); 3039 3040 auto MIB = B.buildInstr(Opc) 3041 .addUse(VData) // vdata 3042 .addUse(RSrc) // rsrc 3043 .addUse(VIndex) // vindex 3044 .addUse(VOffset) // voffset 3045 .addUse(SOffset) // soffset 3046 .addImm(ImmOffset); // offset(imm) 3047 3048 if (IsTyped) 3049 MIB.addImm(Format); 3050 3051 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3052 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3053 .addMemOperand(MMO); 3054 3055 MI.eraseFromParent(); 3056 return true; 3057 } 3058 3059 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3060 MachineRegisterInfo &MRI, 3061 MachineIRBuilder &B, 3062 bool IsFormat, 3063 bool IsTyped) const { 3064 B.setInstr(MI); 3065 3066 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
3067 MachineMemOperand *MMO = *MI.memoperands_begin(); 3068 const int MemSize = MMO->getSize(); 3069 const LLT S32 = LLT::scalar(32); 3070 3071 Register Dst = MI.getOperand(0).getReg(); 3072 Register RSrc = MI.getOperand(2).getReg(); 3073 3074 // The typed intrinsics add an immediate after the registers. 3075 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3076 3077 // The struct intrinsic variants add one additional operand over raw. 3078 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3079 Register VIndex; 3080 int OpOffset = 0; 3081 if (HasVIndex) { 3082 VIndex = MI.getOperand(3).getReg(); 3083 OpOffset = 1; 3084 } 3085 3086 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3087 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3088 3089 unsigned Format = 0; 3090 if (IsTyped) { 3091 Format = MI.getOperand(5 + OpOffset).getImm(); 3092 ++OpOffset; 3093 } 3094 3095 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3096 unsigned ImmOffset; 3097 unsigned TotalOffset; 3098 3099 LLT Ty = MRI.getType(Dst); 3100 LLT EltTy = Ty.getScalarType(); 3101 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3102 const bool Unpacked = ST.hasUnpackedD16VMem(); 3103 3104 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3105 if (TotalOffset != 0) 3106 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3107 3108 unsigned Opc; 3109 3110 if (IsTyped) { 3111 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3112 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3113 } else if (IsFormat) { 3114 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3115 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3116 } else { 3117 switch (MemSize) { 3118 case 1: 3119 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3120 break; 3121 case 2: 3122 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3123 break; 3124 default: 3125 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3126 break; 3127 } 3128 } 3129 3130 Register LoadDstReg; 3131 3132 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3133 LLT UnpackedTy = Ty.changeElementSize(32); 3134 3135 if (IsExtLoad) 3136 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3137 else if (Unpacked && IsD16 && Ty.isVector()) 3138 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3139 else 3140 LoadDstReg = Dst; 3141 3142 if (!VIndex) 3143 VIndex = B.buildConstant(S32, 0).getReg(0); 3144 3145 auto MIB = B.buildInstr(Opc) 3146 .addDef(LoadDstReg) // vdata 3147 .addUse(RSrc) // rsrc 3148 .addUse(VIndex) // vindex 3149 .addUse(VOffset) // voffset 3150 .addUse(SOffset) // soffset 3151 .addImm(ImmOffset); // offset(imm) 3152 3153 if (IsTyped) 3154 MIB.addImm(Format); 3155 3156 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3157 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3158 .addMemOperand(MMO); 3159 3160 if (LoadDstReg != Dst) { 3161 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3162 3163 // Widen result for extending loads was widened. 
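// That is: for an extending load just truncate the widened result back, and
// for a packed d16 vector unmerge and repack it element by element below.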
3164 if (IsExtLoad) 3165 B.buildTrunc(Dst, LoadDstReg); 3166 else { 3167 // Repack to original 16-bit vector result 3168 // FIXME: G_TRUNC should work, but legalization currently fails 3169 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3170 SmallVector<Register, 4> Repack; 3171 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3172 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3173 B.buildMerge(Dst, Repack); 3174 } 3175 } 3176 3177 MI.eraseFromParent(); 3178 return true; 3179 } 3180 3181 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3182 MachineIRBuilder &B, 3183 bool IsInc) const { 3184 B.setInstr(MI); 3185 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3186 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3187 B.buildInstr(Opc) 3188 .addDef(MI.getOperand(0).getReg()) 3189 .addUse(MI.getOperand(2).getReg()) 3190 .addUse(MI.getOperand(3).getReg()) 3191 .cloneMemRefs(MI); 3192 MI.eraseFromParent(); 3193 return true; 3194 } 3195 3196 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3197 switch (IntrID) { 3198 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3199 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3200 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3201 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3202 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3203 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3204 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3205 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3206 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3207 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3208 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3209 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3210 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3211 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3212 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3213 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3214 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3215 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3216 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3217 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3218 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3219 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3220 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3221 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3222 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3223 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3224 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3225 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3226 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3227 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3228 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3229 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3230 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3231 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3232 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3233 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3234 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3235 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3236 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3237 default: 3238 llvm_unreachable("unhandled atomic opcode"); 3239 } 3240 } 3241 3242 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3243 MachineIRBuilder &B, 3244 Intrinsic::ID IID) const { 3245 B.setInstr(MI); 3246 3247 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3248 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3249 3250 Register Dst = MI.getOperand(0).getReg(); 3251 Register VData = 
MI.getOperand(2).getReg(); 3252 3253 Register CmpVal; 3254 int OpOffset = 0; 3255 3256 if (IsCmpSwap) { 3257 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3258 ++OpOffset; 3259 } 3260 3261 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3262 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3263 3264 // The struct intrinsic variants add one additional operand over raw. 3265 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3266 Register VIndex; 3267 if (HasVIndex) { 3268 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3269 ++OpOffset; 3270 } 3271 3272 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3273 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3274 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3275 3276 MachineMemOperand *MMO = *MI.memoperands_begin(); 3277 3278 unsigned ImmOffset; 3279 unsigned TotalOffset; 3280 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3281 if (TotalOffset != 0) 3282 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3283 3284 if (!VIndex) 3285 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3286 3287 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3288 .addDef(Dst) 3289 .addUse(VData); // vdata 3290 3291 if (IsCmpSwap) 3292 MIB.addReg(CmpVal); 3293 3294 MIB.addUse(RSrc) // rsrc 3295 .addUse(VIndex) // vindex 3296 .addUse(VOffset) // voffset 3297 .addUse(SOffset) // soffset 3298 .addImm(ImmOffset) // offset(imm) 3299 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3300 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3301 .addMemOperand(MMO); 3302 3303 MI.eraseFromParent(); 3304 return true; 3305 } 3306 3307 // Produce a vector of s16 elements from s32 pieces. 3308 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg, 3309 ArrayRef<Register> UnmergeParts) { 3310 const LLT S16 = LLT::scalar(16); 3311 3312 SmallVector<Register, 4> RemergeParts(UnmergeParts.size()); 3313 for (int I = 0, E = UnmergeParts.size(); I != E; ++I) 3314 RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0); 3315 3316 B.buildBuildVector(DstReg, RemergeParts); 3317 } 3318 3319 /// Convert a set of s32 registers to a result vector with s16 elements. 3320 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg, 3321 ArrayRef<Register> UnmergeParts) { 3322 MachineRegisterInfo &MRI = *B.getMRI(); 3323 const LLT V2S16 = LLT::vector(2, 16); 3324 LLT TargetTy = MRI.getType(DstReg); 3325 int NumElts = UnmergeParts.size(); 3326 3327 if (NumElts == 1) { 3328 assert(TargetTy == V2S16); 3329 B.buildBitcast(DstReg, UnmergeParts[0]); 3330 return; 3331 } 3332 3333 SmallVector<Register, 4> RemergeParts(NumElts); 3334 for (int I = 0; I != NumElts; ++I) 3335 RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0); 3336 3337 if (TargetTy.getSizeInBits() == 32u * NumElts) { 3338 B.buildConcatVectors(DstReg, RemergeParts); 3339 return; 3340 } 3341 3342 const LLT V3S16 = LLT::vector(3, 16); 3343 const LLT V6S16 = LLT::vector(6, 16); 3344 3345 // Widen to v6s16 and unpack v3 parts. 3346 assert(TargetTy == V3S16); 3347 3348 RemergeParts.push_back(B.buildUndef(V2S16).getReg(0)); 3349 auto Concat = B.buildConcatVectors(V6S16, RemergeParts); 3350 B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat); 3351 } 3352 3353 // FIXME: Just vector trunc should be sufficent, but legalization currently 3354 // broken. 
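// Unmerge the 32-bit-per-element result, truncate each piece to s16, and
// rebuild the vector in the originally requested type.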
3355 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg, 3356 Register WideDstReg) { 3357 const LLT S32 = LLT::scalar(32); 3358 const LLT S16 = LLT::scalar(16); 3359 3360 auto Unmerge = B.buildUnmerge(S32, WideDstReg); 3361 3362 int NumOps = Unmerge->getNumOperands() - 1; 3363 SmallVector<Register, 4> RemergeParts(NumOps); 3364 for (int I = 0; I != NumOps; ++I) 3365 RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0); 3366 3367 B.buildBuildVector(DstReg, RemergeParts); 3368 } 3369 3370 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3371 MachineInstr &MI, MachineIRBuilder &B, 3372 GISelChangeObserver &Observer, 3373 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3374 bool IsTFE = MI.getNumExplicitDefs() == 2; 3375 3376 // We are only processing the operands of d16 image operations on subtargets 3377 // that use the unpacked register layout, or need to repack the TFE result. 3378 3379 // TODO: Need to handle a16 images too 3380 // TODO: Do we need to guard against already legalized intrinsics? 3381 if (!IsTFE && !ST.hasUnpackedD16VMem()) 3382 return true; 3383 3384 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3385 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3386 3387 if (BaseOpcode->Atomic) // No d16 atomics, or TFE. 3388 return true; 3389 3390 B.setInstr(MI); 3391 3392 MachineRegisterInfo *MRI = B.getMRI(); 3393 const LLT S32 = LLT::scalar(32); 3394 const LLT S16 = LLT::scalar(16); 3395 3396 if (BaseOpcode->Store) { // No TFE for stores? 3397 Register VData = MI.getOperand(1).getReg(); 3398 LLT Ty = MRI->getType(VData); 3399 if (!Ty.isVector() || Ty.getElementType() != S16) 3400 return true; 3401 3402 B.setInstr(MI); 3403 3404 Observer.changingInstr(MI); 3405 MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData)); 3406 Observer.changedInstr(MI); 3407 return true; 3408 } 3409 3410 Register DstReg = MI.getOperand(0).getReg(); 3411 LLT Ty = MRI->getType(DstReg); 3412 const LLT EltTy = Ty.getScalarType(); 3413 const bool IsD16 = Ty.getScalarType() == S16; 3414 const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3415 3416 if (IsTFE) { 3417 // In the IR, TFE is supposed to be used with a 2 element struct return 3418 // type. The intruction really returns these two values in one contiguous 3419 // register, with one additional dword beyond the loaded data. Rewrite the 3420 // return type to use a single register result. 3421 Register Dst1Reg = MI.getOperand(1).getReg(); 3422 if (MRI->getType(Dst1Reg) != S32) 3423 return false; 3424 3425 // TODO: Make sure the TFE operand bit is set. 3426 3427 // The raw dword aligned data component of the load. The only legal cases 3428 // where this matters should be when using the packed D16 format, for 3429 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3430 LLT RoundedTy; 3431 LLT TFETy; 3432 3433 if (IsD16 && ST.hasUnpackedD16VMem()) { 3434 RoundedTy = LLT::scalarOrVector(NumElts, 32); 3435 TFETy = LLT::vector(NumElts + 1, 32); 3436 } else { 3437 unsigned EltSize = Ty.getScalarSizeInBits(); 3438 unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32; 3439 unsigned RoundedSize = 32 * RoundedElts; 3440 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3441 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3442 } 3443 3444 Register TFEReg = MRI->createGenericVirtualRegister(TFETy); 3445 Observer.changingInstr(MI); 3446 3447 MI.getOperand(0).setReg(TFEReg); 3448 MI.RemoveOperand(1); 3449 3450 Observer.changedInstr(MI); 3451 3452 // Insert after the instruction. 
    B.setInsertPt(*MI.getParent(), ++MI.getIterator());

    // Now figure out how to copy the new result register back into the old
    // result.

    SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
    int NumDataElts = TFETy.getNumElements() - 1;

    if (!Ty.isVector()) {
      // Simplest case is a trivial unmerge (plus a truncate for d16).
      UnmergeResults[0] = Ty == S32 ?
        DstReg : MRI->createGenericVirtualRegister(S32);

      B.buildUnmerge(UnmergeResults, TFEReg);
      if (Ty != S32)
        B.buildTrunc(DstReg, UnmergeResults[0]);
      return true;
    }

    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataElts; ++I)
      UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
    B.buildUnmerge(UnmergeResults, TFEReg);

    // Drop the final TFE element.
    ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);

    if (EltTy == S32)
      B.buildBuildVector(DstReg, DataPart);
    else if (ST.hasUnpackedD16VMem())
      truncToS16Vector(B, DstReg, DataPart);
    else
      bitcastToS16Vector(B, DstReg, DataPart);

    return true;
  }

  // Must be an image load.
  if (!Ty.isVector() || Ty.getElementType() != S16)
    return true;

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  LLT WidenedTy = Ty.changeElementType(S32);
  Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);

  Observer.changingInstr(MI);
  MI.getOperand(0).setReg(WideDstReg);
  Observer.changedInstr(MI);

  repackUnpackedD16Load(B, DstReg, WideDstReg);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const unsigned MemAlign = 4;
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant, MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
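  // For example, an s96 result is widened to s128 here, and a <3 x s32>
  // result is padded to <4 x s32>; the legalizer helper then narrows the
  // wider value back to the original result type.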
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);
    B.setInstr(MI);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // For the control flow intrinsics (if/else/loop), replace the G_BRCOND that
  // uses their result with the exec manipulation and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrTarget);

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}