//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

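// Reduce the element count so that each resulting piece of the vector is
// 64 bits or smaller.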
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 101 return [=](const LegalityQuery &Query) { 102 const LLT Ty = Query.Types[TypeIdx]; 103 const LLT EltTy = Ty.getElementType(); 104 unsigned Size = Ty.getSizeInBits(); 105 unsigned Pieces = (Size + 63) / 64; 106 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 107 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 108 }; 109 } 110 111 // Increase the number of vector elements to reach the next multiple of 32-bit 112 // type. 113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 114 return [=](const LegalityQuery &Query) { 115 const LLT Ty = Query.Types[TypeIdx]; 116 117 const LLT EltTy = Ty.getElementType(); 118 const int Size = Ty.getSizeInBits(); 119 const int EltSize = EltTy.getSizeInBits(); 120 const int NextMul32 = (Size + 31) / 32; 121 122 assert(EltSize < 32); 123 124 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 125 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 126 }; 127 } 128 129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 130 return [=](const LegalityQuery &Query) { 131 const LLT QueryTy = Query.Types[TypeIdx]; 132 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 133 }; 134 } 135 136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 137 return [=](const LegalityQuery &Query) { 138 const LLT QueryTy = Query.Types[TypeIdx]; 139 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 140 }; 141 } 142 143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 144 return [=](const LegalityQuery &Query) { 145 const LLT QueryTy = Query.Types[TypeIdx]; 146 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 147 }; 148 } 149 150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 151 // v2s16. 
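// (Vectors of 128- and 256-bit elements are also accepted by the predicate
// below.)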
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 153 return [=](const LegalityQuery &Query) { 154 const LLT Ty = Query.Types[TypeIdx]; 155 if (Ty.isVector()) { 156 const int EltSize = Ty.getElementType().getSizeInBits(); 157 return EltSize == 32 || EltSize == 64 || 158 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 159 EltSize == 128 || EltSize == 256; 160 } 161 162 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 163 }; 164 } 165 166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 167 return [=](const LegalityQuery &Query) { 168 const LLT QueryTy = Query.Types[TypeIdx]; 169 return QueryTy.isVector() && QueryTy.getElementType() == Type; 170 }; 171 } 172 173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 174 return [=](const LegalityQuery &Query) { 175 const LLT Ty = Query.Types[TypeIdx]; 176 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 177 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 178 }; 179 } 180 181 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) { 182 return [=](const LegalityQuery &Query) { 183 return Query.Types[TypeIdx0].getSizeInBits() < 184 Query.Types[TypeIdx1].getSizeInBits(); 185 }; 186 } 187 188 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) { 189 return [=](const LegalityQuery &Query) { 190 return Query.Types[TypeIdx0].getSizeInBits() > 191 Query.Types[TypeIdx1].getSizeInBits(); 192 }; 193 } 194 195 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 196 const GCNTargetMachine &TM) 197 : ST(ST_) { 198 using namespace TargetOpcode; 199 200 auto GetAddrSpacePtr = [&TM](unsigned AS) { 201 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 202 }; 203 204 const LLT S1 = LLT::scalar(1); 205 const LLT S16 = LLT::scalar(16); 206 const LLT S32 = LLT::scalar(32); 207 const LLT S64 = LLT::scalar(64); 208 const LLT S128 = LLT::scalar(128); 209 const LLT S256 = LLT::scalar(256); 210 const LLT S1024 = LLT::scalar(1024); 211 212 const LLT V2S16 = LLT::vector(2, 16); 213 const LLT V4S16 = LLT::vector(4, 16); 214 215 const LLT V2S32 = LLT::vector(2, 32); 216 const LLT V3S32 = LLT::vector(3, 32); 217 const LLT V4S32 = LLT::vector(4, 32); 218 const LLT V5S32 = LLT::vector(5, 32); 219 const LLT V6S32 = LLT::vector(6, 32); 220 const LLT V7S32 = LLT::vector(7, 32); 221 const LLT V8S32 = LLT::vector(8, 32); 222 const LLT V9S32 = LLT::vector(9, 32); 223 const LLT V10S32 = LLT::vector(10, 32); 224 const LLT V11S32 = LLT::vector(11, 32); 225 const LLT V12S32 = LLT::vector(12, 32); 226 const LLT V13S32 = LLT::vector(13, 32); 227 const LLT V14S32 = LLT::vector(14, 32); 228 const LLT V15S32 = LLT::vector(15, 32); 229 const LLT V16S32 = LLT::vector(16, 32); 230 const LLT V32S32 = LLT::vector(32, 32); 231 232 const LLT V2S64 = LLT::vector(2, 64); 233 const LLT V3S64 = LLT::vector(3, 64); 234 const LLT V4S64 = LLT::vector(4, 64); 235 const LLT V5S64 = LLT::vector(5, 64); 236 const LLT V6S64 = LLT::vector(6, 64); 237 const LLT V7S64 = LLT::vector(7, 64); 238 const LLT V8S64 = LLT::vector(8, 64); 239 const LLT V16S64 = LLT::vector(16, 64); 240 241 std::initializer_list<LLT> AllS32Vectors = 242 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 243 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 244 std::initializer_list<LLT> AllS64Vectors = 245 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 246 247 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 248 const LLT ConstantPtr = 
GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 249 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 250 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 251 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 252 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 253 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 254 255 const LLT CodePtr = FlatPtr; 256 257 const std::initializer_list<LLT> AddrSpaces64 = { 258 GlobalPtr, ConstantPtr, FlatPtr 259 }; 260 261 const std::initializer_list<LLT> AddrSpaces32 = { 262 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 263 }; 264 265 const std::initializer_list<LLT> FPTypesBase = { 266 S32, S64 267 }; 268 269 const std::initializer_list<LLT> FPTypes16 = { 270 S32, S64, S16 271 }; 272 273 const std::initializer_list<LLT> FPTypesPK16 = { 274 S32, S64, S16, V2S16 275 }; 276 277 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 278 279 setAction({G_BRCOND, S1}, Legal); // VCC branches 280 setAction({G_BRCOND, S32}, Legal); // SCC branches 281 282 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 283 // elements for v3s16 284 getActionDefinitionsBuilder(G_PHI) 285 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 286 .legalFor(AllS32Vectors) 287 .legalFor(AllS64Vectors) 288 .legalFor(AddrSpaces64) 289 .legalFor(AddrSpaces32) 290 .clampScalar(0, S32, S256) 291 .widenScalarToNextPow2(0, 32) 292 .clampMaxNumElements(0, S32, 16) 293 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 294 .legalIf(isPointer(0)); 295 296 if (ST.has16BitInsts()) { 297 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 298 .legalFor({S32, S16}) 299 .clampScalar(0, S16, S32) 300 .scalarize(0) 301 .widenScalarToNextPow2(0, 32); 302 } else { 303 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 304 .legalFor({S32}) 305 .clampScalar(0, S32, S32) 306 .scalarize(0); 307 } 308 309 // FIXME: Not really legal. Placeholder for custom lowering. 310 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 311 .customFor({S32, S64}) 312 .clampScalar(0, S32, S64) 313 .widenScalarToNextPow2(0, 32) 314 .scalarize(0); 315 316 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 317 .legalFor({S32}) 318 .clampScalar(0, S32, S32) 319 .scalarize(0); 320 321 // Report legal for any types we can handle anywhere. For the cases only legal 322 // on the SALU, RegBankSelect will be able to re-legalize. 323 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 324 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 325 .clampScalar(0, S32, S64) 326 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 327 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 328 .widenScalarToNextPow2(0) 329 .scalarize(0); 330 331 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 332 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 333 .legalFor({{S32, S1}, {S32, S32}}) 334 .clampScalar(0, S32, S32) 335 .scalarize(0); // TODO: Implement. 336 337 getActionDefinitionsBuilder(G_BITCAST) 338 // Don't worry about the size constraint. 
339 .legalIf(all(isRegisterType(0), isRegisterType(1))) 340 .lower(); 341 342 343 getActionDefinitionsBuilder(G_CONSTANT) 344 .legalFor({S1, S32, S64, S16, GlobalPtr, 345 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 346 .clampScalar(0, S32, S64) 347 .widenScalarToNextPow2(0) 348 .legalIf(isPointer(0)); 349 350 getActionDefinitionsBuilder(G_FCONSTANT) 351 .legalFor({S32, S64, S16}) 352 .clampScalar(0, S16, S64); 353 354 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 355 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 356 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 357 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 358 .clampScalarOrElt(0, S32, S1024) 359 .legalIf(isMultiple32(0)) 360 .widenScalarToNextPow2(0, 32) 361 .clampMaxNumElements(0, S32, 16); 362 363 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 364 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 365 .unsupportedFor({PrivatePtr}) 366 .custom(); 367 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 368 369 auto &FPOpActions = getActionDefinitionsBuilder( 370 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 371 .legalFor({S32, S64}); 372 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 373 .customFor({S32, S64}); 374 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 375 .customFor({S32, S64}); 376 377 if (ST.has16BitInsts()) { 378 if (ST.hasVOP3PInsts()) 379 FPOpActions.legalFor({S16, V2S16}); 380 else 381 FPOpActions.legalFor({S16}); 382 383 TrigActions.customFor({S16}); 384 FDIVActions.customFor({S16}); 385 } 386 387 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 388 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 389 390 if (ST.hasVOP3PInsts()) { 391 MinNumMaxNum.customFor(FPTypesPK16) 392 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 393 .clampMaxNumElements(0, S16, 2) 394 .clampScalar(0, S16, S64) 395 .scalarize(0); 396 } else if (ST.has16BitInsts()) { 397 MinNumMaxNum.customFor(FPTypes16) 398 .clampScalar(0, S16, S64) 399 .scalarize(0); 400 } else { 401 MinNumMaxNum.customFor(FPTypesBase) 402 .clampScalar(0, S32, S64) 403 .scalarize(0); 404 } 405 406 if (ST.hasVOP3PInsts()) 407 FPOpActions.clampMaxNumElements(0, S16, 2); 408 409 FPOpActions 410 .scalarize(0) 411 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 412 413 TrigActions 414 .scalarize(0) 415 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 416 417 FDIVActions 418 .scalarize(0) 419 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 420 421 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 422 .legalFor(FPTypesPK16) 423 .clampMaxNumElements(0, S16, 2) 424 .scalarize(0) 425 .clampScalar(0, S16, S64); 426 427 if (ST.has16BitInsts()) { 428 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 429 .legalFor({S32, S64, S16}) 430 .scalarize(0) 431 .clampScalar(0, S16, S64); 432 } else { 433 getActionDefinitionsBuilder(G_FSQRT) 434 .legalFor({S32, S64}) 435 .scalarize(0) 436 .clampScalar(0, S32, S64); 437 438 if (ST.hasFractBug()) { 439 getActionDefinitionsBuilder(G_FFLOOR) 440 .customFor({S64}) 441 .legalFor({S32, S64}) 442 .scalarize(0) 443 .clampScalar(0, S32, S64); 444 } else { 445 getActionDefinitionsBuilder(G_FFLOOR) 446 .legalFor({S32, S64}) 447 .scalarize(0) 448 .clampScalar(0, S32, S64); 449 } 450 } 451 452 getActionDefinitionsBuilder(G_FPTRUNC) 453 .legalFor({{S32, S64}, {S16, S32}}) 454 .scalarize(0) 455 .lower(); 456 457 getActionDefinitionsBuilder(G_FPEXT) 458 .legalFor({{S64, S32}, {S32, S16}}) 459 .lowerFor({{S64, S16}}) // FIXME: Implement 460 .scalarize(0); 461 462 getActionDefinitionsBuilder(G_FSUB) 463 // Use actual fsub instruction 464 .legalFor({S32}) 465 // Must use fadd + fneg 466 .lowerFor({S64, S16, V2S16}) 467 .scalarize(0) 468 .clampScalar(0, S32, S64); 469 470 // Whether this is legal depends on the floating point mode for the function. 471 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 472 if (ST.hasMadF16()) 473 FMad.customFor({S32, S16}); 474 else 475 FMad.customFor({S32}); 476 FMad.scalarize(0) 477 .lower(); 478 479 getActionDefinitionsBuilder(G_TRUNC) 480 .alwaysLegal(); 481 482 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 483 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 484 {S32, S1}, {S64, S1}, {S16, S1}}) 485 .scalarize(0) 486 .clampScalar(0, S32, S64) 487 .widenScalarToNextPow2(1, 32); 488 489 // TODO: Split s1->s64 during regbankselect for VALU. 490 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 491 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 492 .lowerFor({{S32, S64}}) 493 .lowerIf(typeIs(1, S1)) 494 .customFor({{S64, S64}}); 495 if (ST.has16BitInsts()) 496 IToFP.legalFor({{S16, S16}}); 497 IToFP.clampScalar(1, S32, S64) 498 .scalarize(0) 499 .widenScalarToNextPow2(1); 500 501 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 502 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 503 .customFor({{S64, S64}}); 504 if (ST.has16BitInsts()) 505 FPToI.legalFor({{S16, S16}}); 506 else 507 FPToI.minScalar(1, S32); 508 509 FPToI.minScalar(0, S32) 510 .scalarize(0) 511 .lower(); 512 513 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 514 .scalarize(0) 515 .lower(); 516 517 if (ST.has16BitInsts()) { 518 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 519 .legalFor({S16, S32, S64}) 520 .clampScalar(0, S16, S64) 521 .scalarize(0); 522 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 523 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 524 .legalFor({S32, S64}) 525 .clampScalar(0, S32, S64) 526 .scalarize(0); 527 } else { 528 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 529 .legalFor({S32}) 530 .customFor({S64}) 531 .clampScalar(0, S32, S64) 532 .scalarize(0); 533 } 534 535 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 536 .scalarize(0) 537 .alwaysLegal(); 538 539 auto &CmpBuilder = 540 getActionDefinitionsBuilder(G_ICMP) 541 // The compare output type differs based on the register bank of the output, 542 // so make both s1 and s32 legal. 
543 // 544 // Scalar compares producing output in scc will be promoted to s32, as that 545 // is the allocatable register type that will be needed for the copy from 546 // scc. This will be promoted during RegBankSelect, and we assume something 547 // before that won't try to use s32 result types. 548 // 549 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 550 // bank. 551 .legalForCartesianProduct( 552 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 553 .legalForCartesianProduct( 554 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 555 if (ST.has16BitInsts()) { 556 CmpBuilder.legalFor({{S1, S16}}); 557 } 558 559 CmpBuilder 560 .widenScalarToNextPow2(1) 561 .clampScalar(1, S32, S64) 562 .scalarize(0) 563 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 564 565 getActionDefinitionsBuilder(G_FCMP) 566 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 567 .widenScalarToNextPow2(1) 568 .clampScalar(1, S32, S64) 569 .scalarize(0); 570 571 // FIXME: fpow has a selection pattern that should move to custom lowering. 572 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW}); 573 if (ST.has16BitInsts()) 574 Exp2Ops.legalFor({S32, S16}); 575 else 576 Exp2Ops.legalFor({S32}); 577 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 578 Exp2Ops.scalarize(0); 579 580 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10}); 581 if (ST.has16BitInsts()) 582 ExpOps.customFor({{S32}, {S16}}); 583 else 584 ExpOps.customFor({S32}); 585 ExpOps.clampScalar(0, MinScalarFPTy, S32) 586 .scalarize(0); 587 588 // The 64-bit versions produce 32-bit results, but only on the SALU. 589 getActionDefinitionsBuilder(G_CTPOP) 590 .legalFor({{S32, S32}, {S32, S64}}) 591 .clampScalar(0, S32, S32) 592 .clampScalar(1, S32, S64) 593 .scalarize(0) 594 .widenScalarToNextPow2(0, 32) 595 .widenScalarToNextPow2(1, 32); 596 597 // The hardware instructions return a different result on 0 than the generic 598 // instructions expect. The hardware produces -1, but these produce the 599 // bitwidth. 600 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 601 .scalarize(0) 602 .clampScalar(0, S32, S32) 603 .clampScalar(1, S32, S64) 604 .widenScalarToNextPow2(0, 32) 605 .widenScalarToNextPow2(1, 32) 606 .lower(); 607 608 // The 64-bit versions produce 32-bit results, but only on the SALU. 609 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 610 .legalFor({{S32, S32}, {S32, S64}}) 611 .clampScalar(0, S32, S32) 612 .clampScalar(1, S32, S64) 613 .scalarize(0) 614 .widenScalarToNextPow2(0, 32) 615 .widenScalarToNextPow2(1, 32); 616 617 getActionDefinitionsBuilder(G_BITREVERSE) 618 .legalFor({S32}) 619 .clampScalar(0, S32, S32) 620 .scalarize(0); 621 622 if (ST.has16BitInsts()) { 623 getActionDefinitionsBuilder(G_BSWAP) 624 .legalFor({S16, S32, V2S16}) 625 .clampMaxNumElements(0, S16, 2) 626 // FIXME: Fixing non-power-of-2 before clamp is workaround for 627 // narrowScalar limitation. 
628 .widenScalarToNextPow2(0) 629 .clampScalar(0, S16, S32) 630 .scalarize(0); 631 632 if (ST.hasVOP3PInsts()) { 633 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 634 .legalFor({S32, S16, V2S16}) 635 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 636 .clampMaxNumElements(0, S16, 2) 637 .clampScalar(0, S16, S32) 638 .widenScalarToNextPow2(0) 639 .scalarize(0); 640 } else { 641 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 642 .legalFor({S32, S16}) 643 .widenScalarToNextPow2(0) 644 .clampScalar(0, S16, S32) 645 .scalarize(0); 646 } 647 } else { 648 // TODO: Should have same legality without v_perm_b32 649 getActionDefinitionsBuilder(G_BSWAP) 650 .legalFor({S32}) 651 .lowerIf(narrowerThan(0, 32)) 652 // FIXME: Fixing non-power-of-2 before clamp is workaround for 653 // narrowScalar limitation. 654 .widenScalarToNextPow2(0) 655 .maxScalar(0, S32) 656 .scalarize(0) 657 .lower(); 658 659 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 660 .legalFor({S32}) 661 .clampScalar(0, S32, S32) 662 .widenScalarToNextPow2(0) 663 .scalarize(0); 664 } 665 666 getActionDefinitionsBuilder(G_INTTOPTR) 667 // List the common cases 668 .legalForCartesianProduct(AddrSpaces64, {S64}) 669 .legalForCartesianProduct(AddrSpaces32, {S32}) 670 .scalarize(0) 671 // Accept any address space as long as the size matches 672 .legalIf(sameSize(0, 1)) 673 .widenScalarIf(smallerThan(1, 0), 674 [](const LegalityQuery &Query) { 675 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 676 }) 677 .narrowScalarIf(greaterThan(1, 0), 678 [](const LegalityQuery &Query) { 679 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 680 }); 681 682 getActionDefinitionsBuilder(G_PTRTOINT) 683 // List the common cases 684 .legalForCartesianProduct(AddrSpaces64, {S64}) 685 .legalForCartesianProduct(AddrSpaces32, {S32}) 686 .scalarize(0) 687 // Accept any address space as long as the size matches 688 .legalIf(sameSize(0, 1)) 689 .widenScalarIf(smallerThan(0, 1), 690 [](const LegalityQuery &Query) { 691 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 692 }) 693 .narrowScalarIf( 694 greaterThan(0, 1), 695 [](const LegalityQuery &Query) { 696 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 697 }); 698 699 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 700 .scalarize(0) 701 .custom(); 702 703 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 704 // handle some operations by just promoting the register during 705 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 706 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 707 switch (AS) { 708 // FIXME: Private element size. 709 case AMDGPUAS::PRIVATE_ADDRESS: 710 return 32; 711 // FIXME: Check subtarget 712 case AMDGPUAS::LOCAL_ADDRESS: 713 return ST.useDS128() ? 128 : 64; 714 715 // Treat constant and global as identical. SMRD loads are sometimes usable 716 // for global loads (ideally constant address space should be eliminated) 717 // depending on the context. Legality cannot be context dependent, but 718 // RegBankSelect can split the load as necessary depending on the pointer 719 // register bank/uniformity and if the memory is invariant or not written in 720 // a kernel. 721 case AMDGPUAS::CONSTANT_ADDRESS: 722 case AMDGPUAS::GLOBAL_ADDRESS: 723 return IsLoad ? 
512 : 128; 724 default: 725 return 128; 726 } 727 }; 728 729 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 730 bool IsLoad) -> bool { 731 const LLT DstTy = Query.Types[0]; 732 733 // Split vector extloads. 734 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 735 unsigned Align = Query.MMODescrs[0].AlignInBits; 736 737 if (MemSize < DstTy.getSizeInBits()) 738 MemSize = std::max(MemSize, Align); 739 740 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 741 return true; 742 743 const LLT PtrTy = Query.Types[1]; 744 unsigned AS = PtrTy.getAddressSpace(); 745 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 746 return true; 747 748 // Catch weird sized loads that don't evenly divide into the access sizes 749 // TODO: May be able to widen depending on alignment etc. 750 unsigned NumRegs = (MemSize + 31) / 32; 751 if (NumRegs == 3) { 752 if (!ST.hasDwordx3LoadStores()) 753 return true; 754 } else { 755 // If the alignment allows, these should have been widened. 756 if (!isPowerOf2_32(NumRegs)) 757 return true; 758 } 759 760 if (Align < MemSize) { 761 const SITargetLowering *TLI = ST.getTargetLowering(); 762 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 763 } 764 765 return false; 766 }; 767 768 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool { 769 unsigned Size = Query.Types[0].getSizeInBits(); 770 if (isPowerOf2_32(Size)) 771 return false; 772 773 if (Size == 96 && ST.hasDwordx3LoadStores()) 774 return false; 775 776 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 777 if (Size >= maxSizeForAddrSpace(AddrSpace, true)) 778 return false; 779 780 unsigned Align = Query.MMODescrs[0].AlignInBits; 781 unsigned RoundedSize = NextPowerOf2(Size); 782 return (Align >= RoundedSize); 783 }; 784 785 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 786 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 787 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 788 789 // TODO: Refine based on subtargets which support unaligned access or 128-bit 790 // LDS 791 // TODO: Unsupported flat for SI. 792 793 for (unsigned Op : {G_LOAD, G_STORE}) { 794 const bool IsStore = Op == G_STORE; 795 796 auto &Actions = getActionDefinitionsBuilder(Op); 797 // Whitelist the common cases. 
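    // Each entry below is {value type, pointer type, memory size in bits,
    // minimum alignment in bits}.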
798 // TODO: Loads to s16 on gfx9 799 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 800 {V2S32, GlobalPtr, 64, GlobalAlign32}, 801 {V4S32, GlobalPtr, 128, GlobalAlign32}, 802 {S128, GlobalPtr, 128, GlobalAlign32}, 803 {S64, GlobalPtr, 64, GlobalAlign32}, 804 {V2S64, GlobalPtr, 128, GlobalAlign32}, 805 {V2S16, GlobalPtr, 32, GlobalAlign32}, 806 {S32, GlobalPtr, 8, GlobalAlign8}, 807 {S32, GlobalPtr, 16, GlobalAlign16}, 808 809 {S32, LocalPtr, 32, 32}, 810 {S64, LocalPtr, 64, 32}, 811 {V2S32, LocalPtr, 64, 32}, 812 {S32, LocalPtr, 8, 8}, 813 {S32, LocalPtr, 16, 16}, 814 {V2S16, LocalPtr, 32, 32}, 815 816 {S32, PrivatePtr, 32, 32}, 817 {S32, PrivatePtr, 8, 8}, 818 {S32, PrivatePtr, 16, 16}, 819 {V2S16, PrivatePtr, 32, 32}, 820 821 {S32, FlatPtr, 32, GlobalAlign32}, 822 {S32, FlatPtr, 16, GlobalAlign16}, 823 {S32, FlatPtr, 8, GlobalAlign8}, 824 {V2S16, FlatPtr, 32, GlobalAlign32}, 825 826 {S32, ConstantPtr, 32, GlobalAlign32}, 827 {V2S32, ConstantPtr, 64, GlobalAlign32}, 828 {V4S32, ConstantPtr, 128, GlobalAlign32}, 829 {S64, ConstantPtr, 64, GlobalAlign32}, 830 {S128, ConstantPtr, 128, GlobalAlign32}, 831 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 832 Actions 833 .customIf(typeIs(1, Constant32Ptr)) 834 // Widen suitably aligned loads by loading extra elements. 835 .moreElementsIf([=](const LegalityQuery &Query) { 836 const LLT Ty = Query.Types[0]; 837 return Op == G_LOAD && Ty.isVector() && 838 shouldWidenLoadResult(Query); 839 }, moreElementsToNextPow2(0)) 840 .widenScalarIf([=](const LegalityQuery &Query) { 841 const LLT Ty = Query.Types[0]; 842 return Op == G_LOAD && !Ty.isVector() && 843 shouldWidenLoadResult(Query); 844 }, widenScalarOrEltToNextPow2(0)) 845 .narrowScalarIf( 846 [=](const LegalityQuery &Query) -> bool { 847 return !Query.Types[0].isVector() && 848 needToSplitMemOp(Query, Op == G_LOAD); 849 }, 850 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 851 const LLT DstTy = Query.Types[0]; 852 const LLT PtrTy = Query.Types[1]; 853 854 const unsigned DstSize = DstTy.getSizeInBits(); 855 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 856 857 // Split extloads. 858 if (DstSize > MemSize) 859 return std::make_pair(0, LLT::scalar(MemSize)); 860 861 if (!isPowerOf2_32(DstSize)) { 862 // We're probably decomposing an odd sized store. Try to split 863 // to the widest type. TODO: Account for alignment. As-is it 864 // should be OK, since the new parts will be further legalized. 865 unsigned FloorSize = PowerOf2Floor(DstSize); 866 return std::make_pair(0, LLT::scalar(FloorSize)); 867 } 868 869 if (DstSize > 32 && (DstSize % 32 != 0)) { 870 // FIXME: Need a way to specify non-extload of larger size if 871 // suitably aligned. 
872 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 873 } 874 875 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 876 Op == G_LOAD); 877 if (MemSize > MaxSize) 878 return std::make_pair(0, LLT::scalar(MaxSize)); 879 880 unsigned Align = Query.MMODescrs[0].AlignInBits; 881 return std::make_pair(0, LLT::scalar(Align)); 882 }) 883 .fewerElementsIf( 884 [=](const LegalityQuery &Query) -> bool { 885 return Query.Types[0].isVector() && 886 needToSplitMemOp(Query, Op == G_LOAD); 887 }, 888 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 889 const LLT DstTy = Query.Types[0]; 890 const LLT PtrTy = Query.Types[1]; 891 892 LLT EltTy = DstTy.getElementType(); 893 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 894 Op == G_LOAD); 895 896 // FIXME: Handle widened to power of 2 results better. This ends 897 // up scalarizing. 898 // FIXME: 3 element stores scalarized on SI 899 900 // Split if it's too large for the address space. 901 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 902 unsigned NumElts = DstTy.getNumElements(); 903 unsigned EltSize = EltTy.getSizeInBits(); 904 905 if (MaxSize % EltSize == 0) { 906 return std::make_pair( 907 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 908 } 909 910 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 911 912 // FIXME: Refine when odd breakdowns handled 913 // The scalars will need to be re-legalized. 914 if (NumPieces == 1 || NumPieces >= NumElts || 915 NumElts % NumPieces != 0) 916 return std::make_pair(0, EltTy); 917 918 return std::make_pair(0, 919 LLT::vector(NumElts / NumPieces, EltTy)); 920 } 921 922 // FIXME: We could probably handle weird extending loads better. 923 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 924 if (DstTy.getSizeInBits() > MemSize) 925 return std::make_pair(0, EltTy); 926 927 unsigned EltSize = EltTy.getSizeInBits(); 928 unsigned DstSize = DstTy.getSizeInBits(); 929 if (!isPowerOf2_32(DstSize)) { 930 // We're probably decomposing an odd sized store. Try to split 931 // to the widest type. TODO: Account for alignment. As-is it 932 // should be OK, since the new parts will be further legalized. 933 unsigned FloorSize = PowerOf2Floor(DstSize); 934 return std::make_pair( 935 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 936 } 937 938 // Need to split because of alignment. 939 unsigned Align = Query.MMODescrs[0].AlignInBits; 940 if (EltSize > Align && 941 (EltSize / Align < DstTy.getNumElements())) { 942 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 943 } 944 945 // May need relegalization for the scalars. 946 return std::make_pair(0, EltTy); 947 }) 948 .minScalar(0, S32); 949 950 if (IsStore) 951 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 952 953 // TODO: Need a bitcast lower option? 954 Actions 955 .legalIf([=](const LegalityQuery &Query) { 956 const LLT Ty0 = Query.Types[0]; 957 unsigned Size = Ty0.getSizeInBits(); 958 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 959 unsigned Align = Query.MMODescrs[0].AlignInBits; 960 961 // FIXME: Widening store from alignment not valid. 962 if (MemSize < Size) 963 MemSize = std::max(MemSize, Align); 964 965 // No extending vector loads. 
966 if (Size > MemSize && Ty0.isVector()) 967 return false; 968 969 switch (MemSize) { 970 case 8: 971 case 16: 972 return Size == 32; 973 case 32: 974 case 64: 975 case 128: 976 return true; 977 case 96: 978 return ST.hasDwordx3LoadStores(); 979 case 256: 980 case 512: 981 return true; 982 default: 983 return false; 984 } 985 }) 986 .widenScalarToNextPow2(0) 987 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 988 } 989 990 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 991 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 992 {S32, GlobalPtr, 16, 2 * 8}, 993 {S32, LocalPtr, 8, 8}, 994 {S32, LocalPtr, 16, 16}, 995 {S32, PrivatePtr, 8, 8}, 996 {S32, PrivatePtr, 16, 16}, 997 {S32, ConstantPtr, 8, 8}, 998 {S32, ConstantPtr, 16, 2 * 8}}); 999 if (ST.hasFlatAddressSpace()) { 1000 ExtLoads.legalForTypesWithMemDesc( 1001 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1002 } 1003 1004 ExtLoads.clampScalar(0, S32, S32) 1005 .widenScalarToNextPow2(0) 1006 .unsupportedIfMemSizeNotPow2() 1007 .lower(); 1008 1009 auto &Atomics = getActionDefinitionsBuilder( 1010 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1011 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1012 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1013 G_ATOMICRMW_UMIN}) 1014 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1015 {S64, GlobalPtr}, {S64, LocalPtr}}); 1016 if (ST.hasFlatAddressSpace()) { 1017 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1018 } 1019 1020 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1021 .legalFor({{S32, LocalPtr}}); 1022 1023 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1024 // demarshalling 1025 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1026 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1027 {S32, FlatPtr}, {S64, FlatPtr}}) 1028 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1029 {S32, RegionPtr}, {S64, RegionPtr}}); 1030 // TODO: Pointer types, any 32-bit or 64-bit vector 1031 1032 // Condition should be s32 for scalar, s1 for vector. 1033 getActionDefinitionsBuilder(G_SELECT) 1034 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1035 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1036 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1037 .clampScalar(0, S16, S64) 1038 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1039 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1040 .scalarize(1) 1041 .clampMaxNumElements(0, S32, 2) 1042 .clampMaxNumElements(0, LocalPtr, 2) 1043 .clampMaxNumElements(0, PrivatePtr, 2) 1044 .scalarize(0) 1045 .widenScalarToNextPow2(0) 1046 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1047 1048 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1049 // be more flexible with the shift amount type. 1050 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1051 .legalFor({{S32, S32}, {S64, S32}}); 1052 if (ST.has16BitInsts()) { 1053 if (ST.hasVOP3PInsts()) { 1054 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 1055 .clampMaxNumElements(0, S16, 2); 1056 } else 1057 Shifts.legalFor({{S16, S32}, {S16, S16}}); 1058 1059 // TODO: Support 16-bit shift amounts 1060 Shifts.clampScalar(1, S32, S32); 1061 Shifts.clampScalar(0, S16, S64); 1062 Shifts.widenScalarToNextPow2(0, 16); 1063 } else { 1064 // Make sure we legalize the shift amount type first, as the general 1065 // expansion for the shifted type will produce much worse code if it hasn't 1066 // been truncated already. 
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1166 if (ST.hasVOP3PInsts()) { 1167 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1168 .customFor({V2S16, V2S16}) 1169 .lower(); 1170 } else 1171 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1172 1173 // Merge/Unmerge 1174 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1175 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1176 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1177 1178 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1179 const LLT &Ty = Query.Types[TypeIdx]; 1180 if (Ty.isVector()) { 1181 const LLT &EltTy = Ty.getElementType(); 1182 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) 1183 return true; 1184 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1185 return true; 1186 } 1187 return false; 1188 }; 1189 1190 auto &Builder = getActionDefinitionsBuilder(Op) 1191 // Try to widen to s16 first for small types. 1192 // TODO: Only do this on targets with legal s16 shifts 1193 .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1194 1195 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1196 .lowerFor({{S16, V2S16}}) 1197 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1198 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1199 elementTypeIs(1, S16)), 1200 changeTo(1, V2S16)) 1201 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1202 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1203 // valid. 1204 .clampScalar(LitTyIdx, S32, S256) 1205 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1206 // Break up vectors with weird elements into scalars 1207 .fewerElementsIf( 1208 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, 1209 scalarize(0)) 1210 .fewerElementsIf( 1211 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, 1212 scalarize(1)) 1213 .clampScalar(BigTyIdx, S32, S1024); 1214 1215 if (Op == G_MERGE_VALUES) { 1216 Builder.widenScalarIf( 1217 // TODO: Use 16-bit shifts if legal for 8-bit values? 1218 [=](const LegalityQuery &Query) { 1219 const LLT Ty = Query.Types[LitTyIdx]; 1220 return Ty.getSizeInBits() < 32; 1221 }, 1222 changeTo(LitTyIdx, S32)); 1223 } 1224 1225 Builder.widenScalarIf( 1226 [=](const LegalityQuery &Query) { 1227 const LLT Ty = Query.Types[BigTyIdx]; 1228 return !isPowerOf2_32(Ty.getSizeInBits()) && 1229 Ty.getSizeInBits() % 16 != 0; 1230 }, 1231 [=](const LegalityQuery &Query) { 1232 // Pick the next power of 2, or a multiple of 64 over 128. 1233 // Whichever is smaller. 1234 const LLT &Ty = Query.Types[BigTyIdx]; 1235 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1236 if (NewSizeInBits >= 256) { 1237 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1238 if (RoundedTo < NewSizeInBits) 1239 NewSizeInBits = RoundedTo; 1240 } 1241 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1242 }) 1243 .legalIf([=](const LegalityQuery &Query) { 1244 const LLT &BigTy = Query.Types[BigTyIdx]; 1245 const LLT &LitTy = Query.Types[LitTyIdx]; 1246 1247 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1248 return false; 1249 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1250 return false; 1251 1252 return BigTy.getSizeInBits() % 16 == 0 && 1253 LitTy.getSizeInBits() % 16 == 0 && 1254 BigTy.getSizeInBits() <= 1024; 1255 }) 1256 // Any vectors left are the wrong size. Scalarize them. 1257 .scalarize(0) 1258 .scalarize(1); 1259 } 1260 1261 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1262 // RegBankSelect. 
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f /
numbers::log2ef); 1364 case TargetOpcode::G_FLOG10: 1365 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1366 case TargetOpcode::G_FEXP: 1367 return legalizeFExp(MI, B); 1368 case TargetOpcode::G_FFLOOR: 1369 return legalizeFFloor(MI, MRI, B); 1370 case TargetOpcode::G_BUILD_VECTOR: 1371 return legalizeBuildVector(MI, MRI, B); 1372 default: 1373 return false; 1374 } 1375 1376 llvm_unreachable("expected switch to return"); 1377 } 1378 1379 Register AMDGPULegalizerInfo::getSegmentAperture( 1380 unsigned AS, 1381 MachineRegisterInfo &MRI, 1382 MachineIRBuilder &B) const { 1383 MachineFunction &MF = B.getMF(); 1384 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1385 const LLT S32 = LLT::scalar(32); 1386 1387 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1388 1389 if (ST.hasApertureRegs()) { 1390 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1391 // getreg. 1392 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1393 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1394 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1395 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1396 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1397 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1398 unsigned Encoding = 1399 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1400 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1401 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1402 1403 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1404 1405 B.buildInstr(AMDGPU::S_GETREG_B32) 1406 .addDef(GetReg) 1407 .addImm(Encoding); 1408 MRI.setType(GetReg, S32); 1409 1410 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1411 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1412 } 1413 1414 Register QueuePtr = MRI.createGenericVirtualRegister( 1415 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1416 1417 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1418 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1419 return Register(); 1420 1421 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1422 // private_segment_aperture_base_hi. 1423 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1424 1425 // TODO: can we be smarter about machine pointer info? 1426 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1427 MachineMemOperand *MMO = MF.getMachineMemOperand( 1428 PtrInfo, 1429 MachineMemOperand::MOLoad | 1430 MachineMemOperand::MODereferenceable | 1431 MachineMemOperand::MOInvariant, 1432 4, 1433 MinAlign(64, StructOffset)); 1434 1435 Register LoadAddr; 1436 1437 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1438 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1439 } 1440 1441 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1442 MachineInstr &MI, MachineRegisterInfo &MRI, 1443 MachineIRBuilder &B) const { 1444 MachineFunction &MF = B.getMF(); 1445 1446 B.setInstr(MI); 1447 1448 const LLT S32 = LLT::scalar(32); 1449 Register Dst = MI.getOperand(0).getReg(); 1450 Register Src = MI.getOperand(1).getReg(); 1451 1452 LLT DstTy = MRI.getType(Dst); 1453 LLT SrcTy = MRI.getType(Src); 1454 unsigned DestAS = DstTy.getAddressSpace(); 1455 unsigned SrcAS = SrcTy.getAddressSpace(); 1456 1457 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1458 // vector element. 
1459 assert(!DstTy.isVector()); 1460 1461 const AMDGPUTargetMachine &TM 1462 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1463 1464 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1465 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1466 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1467 return true; 1468 } 1469 1470 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1471 // Truncate. 1472 B.buildExtract(Dst, Src, 0); 1473 MI.eraseFromParent(); 1474 return true; 1475 } 1476 1477 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1478 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1479 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1480 1481 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1482 // another. Merge operands are required to be the same type, but creating an 1483 // extra ptrtoint would be kind of pointless. 1484 auto HighAddr = B.buildConstant( 1485 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1486 B.buildMerge(Dst, {Src, HighAddr}); 1487 MI.eraseFromParent(); 1488 return true; 1489 } 1490 1491 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1492 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1493 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1494 unsigned NullVal = TM.getNullPointerValue(DestAS); 1495 1496 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1497 auto FlatNull = B.buildConstant(SrcTy, 0); 1498 1499 // Extract low 32-bits of the pointer. 1500 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1501 1502 auto CmpRes = 1503 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1504 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1505 1506 MI.eraseFromParent(); 1507 return true; 1508 } 1509 1510 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1511 return false; 1512 1513 if (!ST.hasFlatAddressSpace()) 1514 return false; 1515 1516 auto SegmentNull = 1517 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1518 auto FlatNull = 1519 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1520 1521 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1522 if (!ApertureReg.isValid()) 1523 return false; 1524 1525 auto CmpRes = 1526 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1527 1528 // Coerce the type of the low half of the result so we can use merge_values. 1529 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1530 1531 // TODO: Should we allow mismatched types but matching sizes in merges to 1532 // avoid the ptrtoint? 1533 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1534 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1535 1536 MI.eraseFromParent(); 1537 return true; 1538 } 1539 1540 bool AMDGPULegalizerInfo::legalizeFrint( 1541 MachineInstr &MI, MachineRegisterInfo &MRI, 1542 MachineIRBuilder &B) const { 1543 B.setInstr(MI); 1544 1545 Register Src = MI.getOperand(1).getReg(); 1546 LLT Ty = MRI.getType(Src); 1547 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1548 1549 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1550 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1551 1552 auto C1 = B.buildFConstant(Ty, C1Val); 1553 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1554 1555 // TODO: Should this propagate fast-math-flags? 
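  // Adding and then subtracting 2^52 (carrying the sign of Src) rounds Src to
  // the nearest integer in double precision; inputs whose magnitude is too
  // large to have a fractional part are selected through unchanged below.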
1556 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1557 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1558 1559 auto C2 = B.buildFConstant(Ty, C2Val); 1560 auto Fabs = B.buildFAbs(Ty, Src); 1561 1562 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1563 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1564 return true; 1565 } 1566 1567 bool AMDGPULegalizerInfo::legalizeFceil( 1568 MachineInstr &MI, MachineRegisterInfo &MRI, 1569 MachineIRBuilder &B) const { 1570 B.setInstr(MI); 1571 1572 const LLT S1 = LLT::scalar(1); 1573 const LLT S64 = LLT::scalar(64); 1574 1575 Register Src = MI.getOperand(1).getReg(); 1576 assert(MRI.getType(Src) == S64); 1577 1578 // result = trunc(src) 1579 // if (src > 0.0 && src != result) 1580 // result += 1.0 1581 1582 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1583 1584 const auto Zero = B.buildFConstant(S64, 0.0); 1585 const auto One = B.buildFConstant(S64, 1.0); 1586 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1587 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1588 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1589 auto Add = B.buildSelect(S64, And, One, Zero); 1590 1591 // TODO: Should this propagate fast-math-flags? 1592 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1593 return true; 1594 } 1595 1596 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1597 MachineIRBuilder &B) { 1598 const unsigned FractBits = 52; 1599 const unsigned ExpBits = 11; 1600 LLT S32 = LLT::scalar(32); 1601 1602 auto Const0 = B.buildConstant(S32, FractBits - 32); 1603 auto Const1 = B.buildConstant(S32, ExpBits); 1604 1605 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1606 .addUse(Const0.getReg(0)) 1607 .addUse(Const1.getReg(0)); 1608 1609 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1610 } 1611 1612 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1613 MachineInstr &MI, MachineRegisterInfo &MRI, 1614 MachineIRBuilder &B) const { 1615 B.setInstr(MI); 1616 1617 const LLT S1 = LLT::scalar(1); 1618 const LLT S32 = LLT::scalar(32); 1619 const LLT S64 = LLT::scalar(64); 1620 1621 Register Src = MI.getOperand(1).getReg(); 1622 assert(MRI.getType(Src) == S64); 1623 1624 // TODO: Should this use extract since the low half is unused? 1625 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1626 Register Hi = Unmerge.getReg(1); 1627 1628 // Extract the upper half, since this is where we will find the sign and 1629 // exponent. 1630 auto Exp = extractF64Exponent(Hi, B); 1631 1632 const unsigned FractBits = 52; 1633 1634 // Extract the sign bit. 1635 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1636 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1637 1638 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1639 1640 const auto Zero32 = B.buildConstant(S32, 0); 1641 1642 // Extend back to 64-bits. 
1643 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1644 1645 auto Shr = B.buildAShr(S64, FractMask, Exp); 1646 auto Not = B.buildNot(S64, Shr); 1647 auto Tmp0 = B.buildAnd(S64, Src, Not); 1648 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1649 1650 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1651 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1652 1653 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1654 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1655 return true; 1656 } 1657 1658 bool AMDGPULegalizerInfo::legalizeITOFP( 1659 MachineInstr &MI, MachineRegisterInfo &MRI, 1660 MachineIRBuilder &B, bool Signed) const { 1661 B.setInstr(MI); 1662 1663 Register Dst = MI.getOperand(0).getReg(); 1664 Register Src = MI.getOperand(1).getReg(); 1665 1666 const LLT S64 = LLT::scalar(64); 1667 const LLT S32 = LLT::scalar(32); 1668 1669 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1670 1671 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1672 1673 auto CvtHi = Signed ? 1674 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1675 B.buildUITOFP(S64, Unmerge.getReg(1)); 1676 1677 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1678 1679 auto ThirtyTwo = B.buildConstant(S32, 32); 1680 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1681 .addUse(CvtHi.getReg(0)) 1682 .addUse(ThirtyTwo.getReg(0)); 1683 1684 // TODO: Should this propagate fast-math-flags? 1685 B.buildFAdd(Dst, LdExp, CvtLo); 1686 MI.eraseFromParent(); 1687 return true; 1688 } 1689 1690 // TODO: Copied from DAG implementation. Verify logic and document how this 1691 // actually works. 1692 bool AMDGPULegalizerInfo::legalizeFPTOI( 1693 MachineInstr &MI, MachineRegisterInfo &MRI, 1694 MachineIRBuilder &B, bool Signed) const { 1695 B.setInstr(MI); 1696 1697 Register Dst = MI.getOperand(0).getReg(); 1698 Register Src = MI.getOperand(1).getReg(); 1699 1700 const LLT S64 = LLT::scalar(64); 1701 const LLT S32 = LLT::scalar(32); 1702 1703 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1704 1705 unsigned Flags = MI.getFlags(); 1706 1707 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1708 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1709 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1710 1711 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1712 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1713 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1714 1715 auto Hi = Signed ? 
1716 B.buildFPTOSI(S32, FloorMul) : 1717 B.buildFPTOUI(S32, FloorMul); 1718 auto Lo = B.buildFPTOUI(S32, Fma); 1719 1720 B.buildMerge(Dst, { Lo, Hi }); 1721 MI.eraseFromParent(); 1722 1723 return true; 1724 } 1725 1726 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1727 MachineInstr &MI, MachineRegisterInfo &MRI, 1728 MachineIRBuilder &B) const { 1729 MachineFunction &MF = B.getMF(); 1730 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1731 1732 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1733 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1734 1735 // With ieee_mode disabled, the instructions have the correct behavior 1736 // already for G_FMINNUM/G_FMAXNUM 1737 if (!MFI->getMode().IEEE) 1738 return !IsIEEEOp; 1739 1740 if (IsIEEEOp) 1741 return true; 1742 1743 MachineIRBuilder HelperBuilder(MI); 1744 GISelObserverWrapper DummyObserver; 1745 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1746 HelperBuilder.setInstr(MI); 1747 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1748 } 1749 1750 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1751 MachineInstr &MI, MachineRegisterInfo &MRI, 1752 MachineIRBuilder &B) const { 1753 // TODO: Should move some of this into LegalizerHelper. 1754 1755 // TODO: Promote dynamic indexing of s16 to s32 1756 1757 // FIXME: Artifact combiner probably should have replaced the truncated 1758 // constant before this, so we shouldn't need 1759 // getConstantVRegValWithLookThrough. 1760 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1761 MI.getOperand(2).getReg(), MRI); 1762 if (!IdxVal) // Dynamic case will be selected to register indexing. 1763 return true; 1764 1765 Register Dst = MI.getOperand(0).getReg(); 1766 Register Vec = MI.getOperand(1).getReg(); 1767 1768 LLT VecTy = MRI.getType(Vec); 1769 LLT EltTy = VecTy.getElementType(); 1770 assert(EltTy == MRI.getType(Dst)); 1771 1772 B.setInstr(MI); 1773 1774 if (IdxVal->Value < VecTy.getNumElements()) 1775 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1776 else 1777 B.buildUndef(Dst); 1778 1779 MI.eraseFromParent(); 1780 return true; 1781 } 1782 1783 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1784 MachineInstr &MI, MachineRegisterInfo &MRI, 1785 MachineIRBuilder &B) const { 1786 // TODO: Should move some of this into LegalizerHelper. 1787 1788 // TODO: Promote dynamic indexing of s16 to s32 1789 1790 // FIXME: Artifact combiner probably should have replaced the truncated 1791 // constant before this, so we shouldn't need 1792 // getConstantVRegValWithLookThrough. 1793 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1794 MI.getOperand(3).getReg(), MRI); 1795 if (!IdxVal) // Dynamic case will be selected to register indexing. 1796 return true; 1797 1798 Register Dst = MI.getOperand(0).getReg(); 1799 Register Vec = MI.getOperand(1).getReg(); 1800 Register Ins = MI.getOperand(2).getReg(); 1801 1802 LLT VecTy = MRI.getType(Vec); 1803 LLT EltTy = VecTy.getElementType(); 1804 assert(EltTy == MRI.getType(Ins)); 1805 1806 B.setInstr(MI); 1807 1808 if (IdxVal->Value < VecTy.getNumElements()) 1809 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1810 else 1811 B.buildUndef(Dst); 1812 1813 MI.eraseFromParent(); 1814 return true; 1815 } 1816 1817 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) { 1818 assert(Mask.size() == 2); 1819 1820 // If one half is undef, the other is trivially in the same reg. 
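  // Masks where both output lanes read from the same input vector (both from
  // the first pair or both from the second) are kept legal here; the selector
  // is expected to realize these with VOP3P op_sel modifiers rather than a
  // full shuffle expansion.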
1821 if (Mask[0] == -1 || Mask[1] == -1) 1822 return true; 1823 return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) || 1824 ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3)); 1825 } 1826 1827 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1828 MachineInstr &MI, MachineRegisterInfo &MRI, 1829 MachineIRBuilder &B) const { 1830 const LLT V2S16 = LLT::vector(2, 16); 1831 1832 Register Dst = MI.getOperand(0).getReg(); 1833 Register Src0 = MI.getOperand(1).getReg(); 1834 LLT DstTy = MRI.getType(Dst); 1835 LLT SrcTy = MRI.getType(Src0); 1836 1837 if (SrcTy == V2S16 && DstTy == V2S16 && 1838 isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1839 return true; 1840 1841 MachineIRBuilder HelperBuilder(MI); 1842 GISelObserverWrapper DummyObserver; 1843 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1844 HelperBuilder.setInstr(MI); 1845 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1846 } 1847 1848 bool AMDGPULegalizerInfo::legalizeSinCos( 1849 MachineInstr &MI, MachineRegisterInfo &MRI, 1850 MachineIRBuilder &B) const { 1851 B.setInstr(MI); 1852 1853 Register DstReg = MI.getOperand(0).getReg(); 1854 Register SrcReg = MI.getOperand(1).getReg(); 1855 LLT Ty = MRI.getType(DstReg); 1856 unsigned Flags = MI.getFlags(); 1857 1858 Register TrigVal; 1859 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1860 if (ST.hasTrigReducedRange()) { 1861 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1862 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1863 .addUse(MulVal.getReg(0)) 1864 .setMIFlags(Flags).getReg(0); 1865 } else 1866 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1867 1868 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1869 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1870 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1871 .addUse(TrigVal) 1872 .setMIFlags(Flags); 1873 MI.eraseFromParent(); 1874 return true; 1875 } 1876 1877 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1878 Register DstReg, LLT PtrTy, 1879 MachineIRBuilder &B, const GlobalValue *GV, 1880 unsigned Offset, unsigned GAFlags) const { 1881 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1882 // to the following code sequence: 1883 // 1884 // For constant address space: 1885 // s_getpc_b64 s[0:1] 1886 // s_add_u32 s0, s0, $symbol 1887 // s_addc_u32 s1, s1, 0 1888 // 1889 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1890 // a fixup or relocation is emitted to replace $symbol with a literal 1891 // constant, which is a pc-relative offset from the encoding of the $symbol 1892 // operand to the global variable. 1893 // 1894 // For global address space: 1895 // s_getpc_b64 s[0:1] 1896 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1897 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1898 // 1899 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1900 // fixups or relocations are emitted to replace $symbol@*@lo and 1901 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1902 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1903 // operand to the global variable. 
1904 // 1905 // What we want here is an offset from the value returned by s_getpc 1906 // (which is the address of the s_add_u32 instruction) to the global 1907 // variable, but since the encoding of $symbol starts 4 bytes after the start 1908 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1909 // small. This requires us to add 4 to the global variable offset in order to 1910 // compute the correct address. 1911 1912 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1913 1914 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1915 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1916 1917 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1918 .addDef(PCReg); 1919 1920 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1921 if (GAFlags == SIInstrInfo::MO_NONE) 1922 MIB.addImm(0); 1923 else 1924 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1925 1926 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1927 1928 if (PtrTy.getSizeInBits() == 32) 1929 B.buildExtract(DstReg, PCReg, 0); 1930 return true; 1931 } 1932 1933 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1934 MachineInstr &MI, MachineRegisterInfo &MRI, 1935 MachineIRBuilder &B) const { 1936 Register DstReg = MI.getOperand(0).getReg(); 1937 LLT Ty = MRI.getType(DstReg); 1938 unsigned AS = Ty.getAddressSpace(); 1939 1940 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1941 MachineFunction &MF = B.getMF(); 1942 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1943 B.setInstr(MI); 1944 1945 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1946 if (!MFI->isEntryFunction()) { 1947 const Function &Fn = MF.getFunction(); 1948 DiagnosticInfoUnsupported BadLDSDecl( 1949 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1950 Fn.getContext().diagnose(BadLDSDecl); 1951 } 1952 1953 // TODO: We could emit code to handle the initialization somewhere. 1954 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1955 const SITargetLowering *TLI = ST.getTargetLowering(); 1956 if (!TLI->shouldUseLDSConstAddress(GV)) { 1957 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 1958 return true; // Leave in place; 1959 } 1960 1961 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 1962 MI.eraseFromParent(); 1963 return true; 1964 } 1965 1966 const Function &Fn = MF.getFunction(); 1967 DiagnosticInfoUnsupported BadInit( 1968 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 1969 Fn.getContext().diagnose(BadInit); 1970 return true; 1971 } 1972 1973 const SITargetLowering *TLI = ST.getTargetLowering(); 1974 1975 if (TLI->shouldEmitFixup(GV)) { 1976 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 1977 MI.eraseFromParent(); 1978 return true; 1979 } 1980 1981 if (TLI->shouldEmitPCReloc(GV)) { 1982 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 1983 MI.eraseFromParent(); 1984 return true; 1985 } 1986 1987 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1988 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 1989 1990 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 1991 MachinePointerInfo::getGOT(MF), 1992 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1993 MachineMemOperand::MOInvariant, 1994 8 /*Size*/, 8 /*Align*/); 1995 1996 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 1997 1998 if (Ty.getSizeInBits() == 32) { 1999 // Truncate if this is a 32-bit constant adrdess. 
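  // The GOT slot always holds a 64-bit absolute address (hence the 8-byte
  // memory operand above), so after loading it the low 32 bits are extracted
  // for 32-bit address spaces.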
2000 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2001 B.buildExtract(DstReg, Load, 0); 2002 } else 2003 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2004 2005 MI.eraseFromParent(); 2006 return true; 2007 } 2008 2009 bool AMDGPULegalizerInfo::legalizeLoad( 2010 MachineInstr &MI, MachineRegisterInfo &MRI, 2011 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2012 B.setInstr(MI); 2013 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2014 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2015 Observer.changingInstr(MI); 2016 MI.getOperand(1).setReg(Cast.getReg(0)); 2017 Observer.changedInstr(MI); 2018 return true; 2019 } 2020 2021 bool AMDGPULegalizerInfo::legalizeFMad( 2022 MachineInstr &MI, MachineRegisterInfo &MRI, 2023 MachineIRBuilder &B) const { 2024 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2025 assert(Ty.isScalar()); 2026 2027 MachineFunction &MF = B.getMF(); 2028 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2029 2030 // TODO: Always legal with future ftz flag. 2031 // FIXME: Do we need just output? 2032 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2033 return true; 2034 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2035 return true; 2036 2037 MachineIRBuilder HelperBuilder(MI); 2038 GISelObserverWrapper DummyObserver; 2039 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2040 HelperBuilder.setMBB(*MI.getParent()); 2041 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2042 } 2043 2044 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2045 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2046 Register DstReg = MI.getOperand(0).getReg(); 2047 Register PtrReg = MI.getOperand(1).getReg(); 2048 Register CmpVal = MI.getOperand(2).getReg(); 2049 Register NewVal = MI.getOperand(3).getReg(); 2050 2051 assert(SITargetLowering::isFlatGlobalAddrSpace( 2052 MRI.getType(PtrReg).getAddressSpace()) && 2053 "this should not have been custom lowered"); 2054 2055 LLT ValTy = MRI.getType(CmpVal); 2056 LLT VecTy = LLT::vector(2, ValTy); 2057 2058 B.setInstr(MI); 2059 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2060 2061 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2062 .addDef(DstReg) 2063 .addUse(PtrReg) 2064 .addUse(PackedVal) 2065 .setMemRefs(MI.memoperands()); 2066 2067 MI.eraseFromParent(); 2068 return true; 2069 } 2070 2071 bool AMDGPULegalizerInfo::legalizeFlog( 2072 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2073 Register Dst = MI.getOperand(0).getReg(); 2074 Register Src = MI.getOperand(1).getReg(); 2075 LLT Ty = B.getMRI()->getType(Dst); 2076 unsigned Flags = MI.getFlags(); 2077 B.setInstr(MI); 2078 2079 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2080 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2081 2082 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2083 MI.eraseFromParent(); 2084 return true; 2085 } 2086 2087 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2088 MachineIRBuilder &B) const { 2089 Register Dst = MI.getOperand(0).getReg(); 2090 Register Src = MI.getOperand(1).getReg(); 2091 unsigned Flags = MI.getFlags(); 2092 LLT Ty = B.getMRI()->getType(Dst); 2093 B.setInstr(MI); 2094 2095 auto K = B.buildFConstant(Ty, numbers::log2e); 2096 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2097 B.buildFExp2(Dst, Mul, Flags); 2098 MI.eraseFromParent(); 2099 return true; 2100 } 2101 2102 // Find a source register, ignoring 
any possible source modifiers. 2103 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2104 Register ModSrc = OrigSrc; 2105 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2106 ModSrc = SrcFNeg->getOperand(1).getReg(); 2107 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2108 ModSrc = SrcFAbs->getOperand(1).getReg(); 2109 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2110 ModSrc = SrcFAbs->getOperand(1).getReg(); 2111 return ModSrc; 2112 } 2113 2114 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2115 MachineRegisterInfo &MRI, 2116 MachineIRBuilder &B) const { 2117 B.setInstr(MI); 2118 2119 const LLT S1 = LLT::scalar(1); 2120 const LLT S64 = LLT::scalar(64); 2121 Register Dst = MI.getOperand(0).getReg(); 2122 Register OrigSrc = MI.getOperand(1).getReg(); 2123 unsigned Flags = MI.getFlags(); 2124 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2125 "this should not have been custom lowered"); 2126 2127 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2128 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2129 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2130 // V_FRACT bug is: 2131 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2132 // 2133 // Convert floor(x) to (x - fract(x)) 2134 2135 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2136 .addUse(OrigSrc) 2137 .setMIFlags(Flags); 2138 2139 // Give source modifier matching some assistance before obscuring a foldable 2140 // pattern. 2141 2142 // TODO: We can avoid the neg on the fract? The input sign to fract 2143 // shouldn't matter? 2144 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2145 2146 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2147 2148 Register Min = MRI.createGenericVirtualRegister(S64); 2149 2150 // We don't need to concern ourselves with the snan handling difference, so 2151 // use the one which will directly select. 2152 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2153 if (MFI->getMode().IEEE) 2154 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2155 else 2156 B.buildFMinNum(Min, Fract, Const, Flags); 2157 2158 Register CorrectedFract = Min; 2159 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2160 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2161 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2162 } 2163 2164 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2165 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2166 2167 MI.eraseFromParent(); 2168 return true; 2169 } 2170 2171 // Turn an illegal packed v2s16 build vector into bit operations. 2172 // TODO: This should probably be a bitcast action in LegalizerHelper. 
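// The two s16 sources are merged into one s32 (first operand in the low half,
// second operand in the high half) and the result is bitcast back to v2s16.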
2173 bool AMDGPULegalizerInfo::legalizeBuildVector( 2174 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2175 Register Dst = MI.getOperand(0).getReg(); 2176 LLT DstTy = MRI.getType(Dst); 2177 const LLT S32 = LLT::scalar(32); 2178 const LLT V2S16 = LLT::vector(2, 16); 2179 (void)DstTy; 2180 (void)V2S16; 2181 assert(DstTy == V2S16); 2182 2183 Register Src0 = MI.getOperand(1).getReg(); 2184 Register Src1 = MI.getOperand(2).getReg(); 2185 assert(MRI.getType(Src0) == LLT::scalar(16)); 2186 2187 B.setInstr(MI); 2188 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2189 B.buildBitcast(Dst, Merge); 2190 2191 MI.eraseFromParent(); 2192 return true; 2193 } 2194 2195 // Return the use branch instruction, otherwise null if the usage is invalid. 2196 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2197 MachineRegisterInfo &MRI, 2198 MachineInstr *&Br) { 2199 Register CondDef = MI.getOperand(0).getReg(); 2200 if (!MRI.hasOneNonDBGUse(CondDef)) 2201 return nullptr; 2202 2203 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2204 if (UseMI.getParent() != MI.getParent() || 2205 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2206 return nullptr; 2207 2208 // Make sure the cond br is followed by a G_BR 2209 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2210 if (Next != MI.getParent()->end()) { 2211 if (Next->getOpcode() != AMDGPU::G_BR) 2212 return nullptr; 2213 Br = &*Next; 2214 } 2215 2216 return &UseMI; 2217 } 2218 2219 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, 2220 Register Reg, LLT Ty) const { 2221 Register LiveIn = MRI.getLiveInVirtReg(Reg); 2222 if (LiveIn) 2223 return LiveIn; 2224 2225 Register NewReg = MRI.createGenericVirtualRegister(Ty); 2226 MRI.addLiveIn(Reg, NewReg); 2227 return NewReg; 2228 } 2229 2230 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2231 const ArgDescriptor *Arg) const { 2232 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2233 return false; // TODO: Handle these 2234 2235 assert(Arg->getRegister().isPhysical()); 2236 2237 MachineRegisterInfo &MRI = *B.getMRI(); 2238 2239 LLT Ty = MRI.getType(DstReg); 2240 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); 2241 2242 if (Arg->isMasked()) { 2243 // TODO: Should we try to emit this once in the entry block? 2244 const LLT S32 = LLT::scalar(32); 2245 const unsigned Mask = Arg->getMask(); 2246 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2247 2248 Register AndMaskSrc = LiveIn; 2249 2250 if (Shift != 0) { 2251 auto ShiftAmt = B.buildConstant(S32, Shift); 2252 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2253 } 2254 2255 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2256 } else 2257 B.buildCopy(DstReg, LiveIn); 2258 2259 // Insert the argument copy if it doens't already exist. 2260 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
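  // If no def of the live-in vreg exists yet, materialize the copy from the
  // physical argument register at the top of the entry block so all later
  // uses are dominated by it.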
2261 if (!MRI.getVRegDef(LiveIn)) { 2262 // FIXME: Should have scoped insert pt 2263 MachineBasicBlock &OrigInsBB = B.getMBB(); 2264 auto OrigInsPt = B.getInsertPt(); 2265 2266 MachineBasicBlock &EntryMBB = B.getMF().front(); 2267 EntryMBB.addLiveIn(Arg->getRegister()); 2268 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2269 B.buildCopy(LiveIn, Arg->getRegister()); 2270 2271 B.setInsertPt(OrigInsBB, OrigInsPt); 2272 } 2273 2274 return true; 2275 } 2276 2277 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2278 MachineInstr &MI, 2279 MachineRegisterInfo &MRI, 2280 MachineIRBuilder &B, 2281 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2282 B.setInstr(MI); 2283 2284 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2285 2286 const ArgDescriptor *Arg; 2287 const TargetRegisterClass *RC; 2288 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2289 if (!Arg) { 2290 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2291 return false; 2292 } 2293 2294 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 2295 MI.eraseFromParent(); 2296 return true; 2297 } 2298 2299 return false; 2300 } 2301 2302 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2303 MachineRegisterInfo &MRI, 2304 MachineIRBuilder &B) const { 2305 B.setInstr(MI); 2306 Register Dst = MI.getOperand(0).getReg(); 2307 LLT DstTy = MRI.getType(Dst); 2308 LLT S16 = LLT::scalar(16); 2309 LLT S32 = LLT::scalar(32); 2310 LLT S64 = LLT::scalar(64); 2311 2312 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2313 return true; 2314 2315 if (DstTy == S16) 2316 return legalizeFDIV16(MI, MRI, B); 2317 if (DstTy == S32) 2318 return legalizeFDIV32(MI, MRI, B); 2319 if (DstTy == S64) 2320 return legalizeFDIV64(MI, MRI, B); 2321 2322 return false; 2323 } 2324 2325 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2326 const LLT S32 = LLT::scalar(32); 2327 2328 auto Cvt0 = B.buildUITOFP(S32, Src); 2329 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2330 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2331 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2332 return B.buildFPTOUI(S32, Mul).getReg(0); 2333 } 2334 2335 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2336 Register DstReg, 2337 Register Num, 2338 Register Den, 2339 bool IsRem) const { 2340 const LLT S1 = LLT::scalar(1); 2341 const LLT S32 = LLT::scalar(32); 2342 2343 // RCP = URECIP(Den) = 2^32 / Den + e 2344 // e is rounding error. 2345 auto RCP = buildDivRCP(B, Den); 2346 2347 // RCP_LO = mul(RCP, Den) 2348 auto RCP_LO = B.buildMul(S32, RCP, Den); 2349 2350 // RCP_HI = mulhu (RCP, Den) */ 2351 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2352 2353 // NEG_RCP_LO = -RCP_LO 2354 auto Zero = B.buildConstant(S32, 0); 2355 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2356 2357 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2358 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2359 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2360 2361 // Calculate the rounding error from the URECIP instruction 2362 // E = mulhu(ABS_RCP_LO, RCP) 2363 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2364 2365 // RCP_A_E = RCP + E 2366 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2367 2368 // RCP_S_E = RCP - E 2369 auto RCP_S_E = B.buildSub(S32, RCP, E); 2370 2371 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_SUB_E)
  auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  auto Quotient = B.buildUMulH(S32, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);

  // Remainder_GE_Den = Remainder >= Den
  auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);

  // Remainder_GE_Zero = Num >= Num_S_Remainder
  auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
                                       Num, Num_S_Remainder);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);

  // Calculate Division result:

  // Quotient_A_One = Quotient + 1
  auto One = B.buildConstant(S32, 1);
  auto Quotient_A_One = B.buildAdd(S32, Quotient, One);

  // Quotient_S_One = Quotient - 1
  auto Quotient_S_One = B.buildSub(S32, Quotient, One);

  // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
  auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);

  // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
  if (IsRem) {
    Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);

    // Calculate Rem result:
    auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);

    // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
    auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);

    // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
    B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
  } else {
    B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
  }
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);
  const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
    return legalizeUDIV_UREM32(MI, MRI, B);
  return false;
}

bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);
  const LLT S32 = LLT::scalar(32);

  const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);

  LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);

  LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
  RHS = B.buildXor(S32, RHS, RHSign).getReg(0);

  Register UDivRem = MRI.createGenericVirtualRegister(S32);
  legalizeUDIV_UREM32Impl(B,
UDivRem, LHS, RHS, IsRem); 2469 2470 if (IsRem) { 2471 auto RSign = LHSign; // Remainder sign is the same as LHS 2472 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0); 2473 B.buildSub(DstReg, UDivRem, RSign); 2474 } else { 2475 auto DSign = B.buildXor(S32, LHSign, RHSign); 2476 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0); 2477 B.buildSub(DstReg, UDivRem, DSign); 2478 } 2479 2480 MI.eraseFromParent(); 2481 return true; 2482 } 2483 2484 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2485 MachineRegisterInfo &MRI, 2486 MachineIRBuilder &B) const { 2487 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2488 return legalizeSDIV_SREM32(MI, MRI, B); 2489 return false; 2490 } 2491 2492 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2493 MachineRegisterInfo &MRI, 2494 MachineIRBuilder &B) const { 2495 Register Res = MI.getOperand(0).getReg(); 2496 Register LHS = MI.getOperand(1).getReg(); 2497 Register RHS = MI.getOperand(2).getReg(); 2498 2499 uint16_t Flags = MI.getFlags(); 2500 2501 LLT ResTy = MRI.getType(Res); 2502 LLT S32 = LLT::scalar(32); 2503 LLT S64 = LLT::scalar(64); 2504 2505 const MachineFunction &MF = B.getMF(); 2506 bool Unsafe = 2507 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2508 2509 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2510 return false; 2511 2512 if (!Unsafe && ResTy == S32 && 2513 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2514 return false; 2515 2516 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2517 // 1 / x -> RCP(x) 2518 if (CLHS->isExactlyValue(1.0)) { 2519 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2520 .addUse(RHS) 2521 .setMIFlags(Flags); 2522 2523 MI.eraseFromParent(); 2524 return true; 2525 } 2526 2527 // -1 / x -> RCP( FNEG(x) ) 2528 if (CLHS->isExactlyValue(-1.0)) { 2529 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2530 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2531 .addUse(FNeg.getReg(0)) 2532 .setMIFlags(Flags); 2533 2534 MI.eraseFromParent(); 2535 return true; 2536 } 2537 } 2538 2539 // x / y -> x * (1.0 / y) 2540 if (Unsafe) { 2541 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2542 .addUse(RHS) 2543 .setMIFlags(Flags); 2544 B.buildFMul(Res, LHS, RCP, Flags); 2545 2546 MI.eraseFromParent(); 2547 return true; 2548 } 2549 2550 return false; 2551 } 2552 2553 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2554 MachineRegisterInfo &MRI, 2555 MachineIRBuilder &B) const { 2556 B.setInstr(MI); 2557 Register Res = MI.getOperand(0).getReg(); 2558 Register LHS = MI.getOperand(1).getReg(); 2559 Register RHS = MI.getOperand(2).getReg(); 2560 2561 uint16_t Flags = MI.getFlags(); 2562 2563 LLT S16 = LLT::scalar(16); 2564 LLT S32 = LLT::scalar(32); 2565 2566 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2567 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2568 2569 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2570 .addUse(RHSExt.getReg(0)) 2571 .setMIFlags(Flags); 2572 2573 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2574 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2575 2576 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2577 .addUse(RDst.getReg(0)) 2578 .addUse(RHS) 2579 .addUse(LHS) 2580 .setMIFlags(Flags); 2581 2582 MI.eraseFromParent(); 2583 return true; 2584 } 2585 2586 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2587 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 
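// On subtargets with S_DENORM_MODE this is a single instruction taking the
// combined FP32/FP64-FP16 field as an immediate; otherwise S_SETREG rewrites
// just the 2-bit FP32 denorm field (offset 4, width 2) of the MODE register.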
2588 static void toggleSPDenormMode(bool Enable, 2589 MachineIRBuilder &B, 2590 const GCNSubtarget &ST, 2591 AMDGPU::SIModeRegisterDefaults Mode) { 2592 // Set SP denorm mode to this value. 2593 unsigned SPDenormMode = 2594 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2595 2596 if (ST.hasDenormModeInst()) { 2597 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2598 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2599 2600 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2601 B.buildInstr(AMDGPU::S_DENORM_MODE) 2602 .addImm(NewDenormModeValue); 2603 2604 } else { 2605 // Select FP32 bit field in mode register. 2606 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2607 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2608 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2609 2610 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2611 .addImm(SPDenormMode) 2612 .addImm(SPDenormModeBitField); 2613 } 2614 } 2615 2616 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2617 MachineRegisterInfo &MRI, 2618 MachineIRBuilder &B) const { 2619 B.setInstr(MI); 2620 Register Res = MI.getOperand(0).getReg(); 2621 Register LHS = MI.getOperand(1).getReg(); 2622 Register RHS = MI.getOperand(2).getReg(); 2623 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2624 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2625 2626 uint16_t Flags = MI.getFlags(); 2627 2628 LLT S32 = LLT::scalar(32); 2629 LLT S1 = LLT::scalar(1); 2630 2631 auto One = B.buildFConstant(S32, 1.0f); 2632 2633 auto DenominatorScaled = 2634 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2635 .addUse(RHS) 2636 .addUse(LHS) 2637 .addImm(1) 2638 .setMIFlags(Flags); 2639 auto NumeratorScaled = 2640 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2641 .addUse(LHS) 2642 .addUse(RHS) 2643 .addImm(0) 2644 .setMIFlags(Flags); 2645 2646 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2647 .addUse(DenominatorScaled.getReg(0)) 2648 .setMIFlags(Flags); 2649 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2650 2651 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2652 // aren't modeled as reading it. 
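  // The expansion below is the usual scaled Newton-Raphson sequence: refine an
  // approximate 1/denominator with a chain of FMAs, multiply by the scaled
  // numerator, then let div_fmas/div_fixup apply the final correction and undo
  // the scaling. FP32 denormals are enabled around the FMAs so intermediate
  // results are not flushed.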
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.
2728 2729 LLT S32 = LLT::scalar(32); 2730 2731 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2732 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2733 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2734 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2735 2736 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2737 Scale1Unmerge.getReg(1)); 2738 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2739 Scale0Unmerge.getReg(1)); 2740 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2741 } else { 2742 Scale = DivScale1.getReg(1); 2743 } 2744 2745 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2746 .addUse(Fma4.getReg(0)) 2747 .addUse(Fma3.getReg(0)) 2748 .addUse(Mul.getReg(0)) 2749 .addUse(Scale) 2750 .setMIFlags(Flags); 2751 2752 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2753 .addUse(Fmas.getReg(0)) 2754 .addUse(RHS) 2755 .addUse(LHS) 2756 .setMIFlags(Flags); 2757 2758 MI.eraseFromParent(); 2759 return true; 2760 } 2761 2762 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2763 MachineRegisterInfo &MRI, 2764 MachineIRBuilder &B) const { 2765 B.setInstr(MI); 2766 Register Res = MI.getOperand(0).getReg(); 2767 Register LHS = MI.getOperand(2).getReg(); 2768 Register RHS = MI.getOperand(3).getReg(); 2769 uint16_t Flags = MI.getFlags(); 2770 2771 LLT S32 = LLT::scalar(32); 2772 LLT S1 = LLT::scalar(1); 2773 2774 auto Abs = B.buildFAbs(S32, RHS, Flags); 2775 const APFloat C0Val(1.0f); 2776 2777 auto C0 = B.buildConstant(S32, 0x6f800000); 2778 auto C1 = B.buildConstant(S32, 0x2f800000); 2779 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2780 2781 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2782 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2783 2784 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2785 2786 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2787 .addUse(Mul0.getReg(0)) 2788 .setMIFlags(Flags); 2789 2790 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2791 2792 B.buildFMul(Res, Sel, Mul1, Flags); 2793 2794 MI.eraseFromParent(); 2795 return true; 2796 } 2797 2798 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2799 MachineRegisterInfo &MRI, 2800 MachineIRBuilder &B) const { 2801 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2802 if (!MFI->isEntryFunction()) { 2803 return legalizePreloadedArgIntrin(MI, MRI, B, 2804 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2805 } 2806 2807 B.setInstr(MI); 2808 2809 uint64_t Offset = 2810 ST.getTargetLowering()->getImplicitParameterOffset( 2811 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2812 Register DstReg = MI.getOperand(0).getReg(); 2813 LLT DstTy = MRI.getType(DstReg); 2814 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2815 2816 const ArgDescriptor *Arg; 2817 const TargetRegisterClass *RC; 2818 std::tie(Arg, RC) 2819 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2820 if (!Arg) 2821 return false; 2822 2823 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2824 if (!loadInputValue(KernargPtrReg, B, Arg)) 2825 return false; 2826 2827 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2828 MI.eraseFromParent(); 2829 return true; 2830 } 2831 2832 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2833 MachineRegisterInfo &MRI, 2834 MachineIRBuilder &B, 2835 unsigned AddrSpace) const { 2836 B.setInstr(MI); 2837 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2838 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2839 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2840 MI.eraseFromParent(); 2841 return true; 2842 } 2843 2844 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2845 // offset (the offset that is included in bounds checking and swizzling, to be 2846 // split between the instruction's voffset and immoffset fields) and soffset 2847 // (the offset that is excluded from bounds checking and swizzling, to go in 2848 // the instruction's soffset field). This function takes the first kind of 2849 // offset and figures out how to split it between voffset and immoffset. 2850 std::tuple<Register, unsigned, unsigned> 2851 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2852 Register OrigOffset) const { 2853 const unsigned MaxImm = 4095; 2854 Register BaseReg; 2855 unsigned TotalConstOffset; 2856 MachineInstr *OffsetDef; 2857 const LLT S32 = LLT::scalar(32); 2858 2859 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2860 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2861 2862 unsigned ImmOffset = TotalConstOffset; 2863 2864 // If the immediate value is too big for the immoffset field, put the value 2865 // and -4096 into the immoffset field so that the value that is copied/added 2866 // for the voffset field is a multiple of 4096, and it stands more chance 2867 // of being CSEd with the copy/add for another similar load/store. 2868 // However, do not do that rounding down to a multiple of 4096 if that is a 2869 // negative number, as it appears to be illegal to have a negative offset 2870 // in the vgpr, even if adding the immediate offset makes it positive. 2871 unsigned Overflow = ImmOffset & ~MaxImm; 2872 ImmOffset -= Overflow; 2873 if ((int32_t)Overflow < 0) { 2874 Overflow += ImmOffset; 2875 ImmOffset = 0; 2876 } 2877 2878 if (Overflow != 0) { 2879 if (!BaseReg) { 2880 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2881 } else { 2882 auto OverflowVal = B.buildConstant(S32, Overflow); 2883 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2884 } 2885 } 2886 2887 if (!BaseReg) 2888 BaseReg = B.buildConstant(S32, 0).getReg(0); 2889 2890 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2891 } 2892 2893 /// Handle register layout difference for f16 images for some subtargets. 2894 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2895 MachineRegisterInfo &MRI, 2896 Register Reg) const { 2897 if (!ST.hasUnpackedD16VMem()) 2898 return Reg; 2899 2900 const LLT S16 = LLT::scalar(16); 2901 const LLT S32 = LLT::scalar(32); 2902 LLT StoreVT = MRI.getType(Reg); 2903 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2904 2905 auto Unmerge = B.buildUnmerge(S16, Reg); 2906 2907 SmallVector<Register, 4> WideRegs; 2908 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2909 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2910 2911 int NumElts = StoreVT.getNumElements(); 2912 2913 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2914 } 2915 2916 Register AMDGPULegalizerInfo::fixStoreSourceType( 2917 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2918 MachineRegisterInfo *MRI = B.getMRI(); 2919 LLT Ty = MRI->getType(VData); 2920 2921 const LLT S16 = LLT::scalar(16); 2922 2923 // Fixup illegal register types for i8 stores. 
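  // 8-bit and 16-bit store data is any-extended to a full 32-bit value; the
  // byte/short buffer store instructions only consume the low bits of the
  // VGPR.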
2924 if (Ty == LLT::scalar(8) || Ty == S16) { 2925 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2926 return AnyExt; 2927 } 2928 2929 if (Ty.isVector()) { 2930 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2931 if (IsFormat) 2932 return handleD16VData(B, *MRI, VData); 2933 } 2934 } 2935 2936 return VData; 2937 } 2938 2939 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2940 MachineRegisterInfo &MRI, 2941 MachineIRBuilder &B, 2942 bool IsTyped, 2943 bool IsFormat) const { 2944 B.setInstr(MI); 2945 2946 Register VData = MI.getOperand(1).getReg(); 2947 LLT Ty = MRI.getType(VData); 2948 LLT EltTy = Ty.getScalarType(); 2949 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2950 const LLT S32 = LLT::scalar(32); 2951 2952 VData = fixStoreSourceType(B, VData, IsFormat); 2953 Register RSrc = MI.getOperand(2).getReg(); 2954 2955 MachineMemOperand *MMO = *MI.memoperands_begin(); 2956 const int MemSize = MMO->getSize(); 2957 2958 unsigned ImmOffset; 2959 unsigned TotalOffset; 2960 2961 // The typed intrinsics add an immediate after the registers. 2962 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2963 2964 // The struct intrinsic variants add one additional operand over raw. 2965 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2966 Register VIndex; 2967 int OpOffset = 0; 2968 if (HasVIndex) { 2969 VIndex = MI.getOperand(3).getReg(); 2970 OpOffset = 1; 2971 } 2972 2973 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2974 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2975 2976 unsigned Format = 0; 2977 if (IsTyped) { 2978 Format = MI.getOperand(5 + OpOffset).getImm(); 2979 ++OpOffset; 2980 } 2981 2982 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2983 2984 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2985 if (TotalOffset != 0) 2986 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2987 2988 unsigned Opc; 2989 if (IsTyped) { 2990 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 2991 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 2992 } else if (IsFormat) { 2993 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 2994 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 2995 } else { 2996 switch (MemSize) { 2997 case 1: 2998 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 2999 break; 3000 case 2: 3001 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3002 break; 3003 default: 3004 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3005 break; 3006 } 3007 } 3008 3009 if (!VIndex) 3010 VIndex = B.buildConstant(S32, 0).getReg(0); 3011 3012 auto MIB = B.buildInstr(Opc) 3013 .addUse(VData) // vdata 3014 .addUse(RSrc) // rsrc 3015 .addUse(VIndex) // vindex 3016 .addUse(VOffset) // voffset 3017 .addUse(SOffset) // soffset 3018 .addImm(ImmOffset); // offset(imm) 3019 3020 if (IsTyped) 3021 MIB.addImm(Format); 3022 3023 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3024 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3025 .addMemOperand(MMO); 3026 3027 MI.eraseFromParent(); 3028 return true; 3029 } 3030 3031 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3032 MachineRegisterInfo &MRI, 3033 MachineIRBuilder &B, 3034 bool IsFormat, 3035 bool IsTyped) const { 3036 B.setInstr(MI); 3037 3038 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
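  // Operand layout after the intrinsic ID is: rsrc, [vindex (struct forms
  // only),] voffset, soffset, [format (typed forms only),] aux/cachepolicy.
  // The operand count is what distinguishes the raw and struct variants below.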
3039 MachineMemOperand *MMO = *MI.memoperands_begin(); 3040 const int MemSize = MMO->getSize(); 3041 const LLT S32 = LLT::scalar(32); 3042 3043 Register Dst = MI.getOperand(0).getReg(); 3044 Register RSrc = MI.getOperand(2).getReg(); 3045 3046 // The typed intrinsics add an immediate after the registers. 3047 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3048 3049 // The struct intrinsic variants add one additional operand over raw. 3050 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3051 Register VIndex; 3052 int OpOffset = 0; 3053 if (HasVIndex) { 3054 VIndex = MI.getOperand(3).getReg(); 3055 OpOffset = 1; 3056 } 3057 3058 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3059 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3060 3061 unsigned Format = 0; 3062 if (IsTyped) { 3063 Format = MI.getOperand(5 + OpOffset).getImm(); 3064 ++OpOffset; 3065 } 3066 3067 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3068 unsigned ImmOffset; 3069 unsigned TotalOffset; 3070 3071 LLT Ty = MRI.getType(Dst); 3072 LLT EltTy = Ty.getScalarType(); 3073 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3074 const bool Unpacked = ST.hasUnpackedD16VMem(); 3075 3076 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3077 if (TotalOffset != 0) 3078 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3079 3080 unsigned Opc; 3081 3082 if (IsTyped) { 3083 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3084 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3085 } else if (IsFormat) { 3086 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3087 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3088 } else { 3089 switch (MemSize) { 3090 case 1: 3091 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3092 break; 3093 case 2: 3094 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3095 break; 3096 default: 3097 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3098 break; 3099 } 3100 } 3101 3102 Register LoadDstReg; 3103 3104 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3105 LLT UnpackedTy = Ty.changeElementSize(32); 3106 3107 if (IsExtLoad) 3108 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3109 else if (Unpacked && IsD16 && Ty.isVector()) 3110 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3111 else 3112 LoadDstReg = Dst; 3113 3114 if (!VIndex) 3115 VIndex = B.buildConstant(S32, 0).getReg(0); 3116 3117 auto MIB = B.buildInstr(Opc) 3118 .addDef(LoadDstReg) // vdata 3119 .addUse(RSrc) // rsrc 3120 .addUse(VIndex) // vindex 3121 .addUse(VOffset) // voffset 3122 .addUse(SOffset) // soffset 3123 .addImm(ImmOffset); // offset(imm) 3124 3125 if (IsTyped) 3126 MIB.addImm(Format); 3127 3128 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3129 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3130 .addMemOperand(MMO); 3131 3132 if (LoadDstReg != Dst) { 3133 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3134 3135 // Widen result for extending loads was widened. 
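    // For sub-dword scalar results the instruction produced a full 32-bit
    // value, so truncate back to the original narrow type; packed d16 vectors
    // on unpacked subtargets are instead repacked element by element.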
3136 if (IsExtLoad) 3137 B.buildTrunc(Dst, LoadDstReg); 3138 else { 3139 // Repack to original 16-bit vector result 3140 // FIXME: G_TRUNC should work, but legalization currently fails 3141 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3142 SmallVector<Register, 4> Repack; 3143 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3144 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3145 B.buildMerge(Dst, Repack); 3146 } 3147 } 3148 3149 MI.eraseFromParent(); 3150 return true; 3151 } 3152 3153 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3154 MachineIRBuilder &B, 3155 bool IsInc) const { 3156 B.setInstr(MI); 3157 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3158 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3159 B.buildInstr(Opc) 3160 .addDef(MI.getOperand(0).getReg()) 3161 .addUse(MI.getOperand(2).getReg()) 3162 .addUse(MI.getOperand(3).getReg()) 3163 .cloneMemRefs(MI); 3164 MI.eraseFromParent(); 3165 return true; 3166 } 3167 3168 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3169 switch (IntrID) { 3170 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3171 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3172 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3173 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3174 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3175 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3176 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3177 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3178 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3179 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3180 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3181 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3182 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3183 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3184 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3185 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3186 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3187 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3188 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3189 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3190 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3191 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3192 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3193 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3194 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3195 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3196 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3197 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3198 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3199 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3200 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3201 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3202 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3203 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3204 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3205 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3206 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3207 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3208 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3209 default: 3210 llvm_unreachable("unhandled atomic opcode"); 3211 } 3212 } 3213 3214 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3215 MachineIRBuilder &B, 3216 Intrinsic::ID IID) const { 3217 B.setInstr(MI); 3218 3219 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3220 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3221 3222 Register Dst = MI.getOperand(0).getReg(); 3223 Register VData = 
MI.getOperand(2).getReg(); 3224 3225 Register CmpVal; 3226 int OpOffset = 0; 3227 3228 if (IsCmpSwap) { 3229 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3230 ++OpOffset; 3231 } 3232 3233 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3234 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3235 3236 // The struct intrinsic variants add one additional operand over raw. 3237 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3238 Register VIndex; 3239 if (HasVIndex) { 3240 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3241 ++OpOffset; 3242 } 3243 3244 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3245 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3246 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3247 3248 MachineMemOperand *MMO = *MI.memoperands_begin(); 3249 3250 unsigned ImmOffset; 3251 unsigned TotalOffset; 3252 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3253 if (TotalOffset != 0) 3254 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3255 3256 if (!VIndex) 3257 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3258 3259 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3260 .addDef(Dst) 3261 .addUse(VData); // vdata 3262 3263 if (IsCmpSwap) 3264 MIB.addReg(CmpVal); 3265 3266 MIB.addUse(RSrc) // rsrc 3267 .addUse(VIndex) // vindex 3268 .addUse(VOffset) // voffset 3269 .addUse(SOffset) // soffset 3270 .addImm(ImmOffset) // offset(imm) 3271 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3272 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3273 .addMemOperand(MMO); 3274 3275 MI.eraseFromParent(); 3276 return true; 3277 } 3278 3279 // Produce a vector of s16 elements from s32 pieces. 3280 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg, 3281 ArrayRef<Register> UnmergeParts) { 3282 const LLT S16 = LLT::scalar(16); 3283 3284 SmallVector<Register, 4> RemergeParts(UnmergeParts.size()); 3285 for (int I = 0, E = UnmergeParts.size(); I != E; ++I) 3286 RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0); 3287 3288 B.buildBuildVector(DstReg, RemergeParts); 3289 } 3290 3291 /// Convert a set of s32 registers to a result vector with s16 elements. 3292 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg, 3293 ArrayRef<Register> UnmergeParts) { 3294 MachineRegisterInfo &MRI = *B.getMRI(); 3295 const LLT V2S16 = LLT::vector(2, 16); 3296 LLT TargetTy = MRI.getType(DstReg); 3297 int NumElts = UnmergeParts.size(); 3298 3299 if (NumElts == 1) { 3300 assert(TargetTy == V2S16); 3301 B.buildBitcast(DstReg, UnmergeParts[0]); 3302 return; 3303 } 3304 3305 SmallVector<Register, 4> RemergeParts(NumElts); 3306 for (int I = 0; I != NumElts; ++I) 3307 RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0); 3308 3309 if (TargetTy.getSizeInBits() == 32u * NumElts) { 3310 B.buildConcatVectors(DstReg, RemergeParts); 3311 return; 3312 } 3313 3314 const LLT V3S16 = LLT::vector(3, 16); 3315 const LLT V6S16 = LLT::vector(6, 16); 3316 3317 // Widen to v6s16 and unpack v3 parts. 3318 assert(TargetTy == V3S16); 3319 3320 RemergeParts.push_back(B.buildUndef(V2S16).getReg(0)); 3321 auto Concat = B.buildConcatVectors(V6S16, RemergeParts); 3322 B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat); 3323 } 3324 3325 // FIXME: Just vector trunc should be sufficent, but legalization currently 3326 // broken. 
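// Unmerge the wide 32-bit-per-element result into s32 pieces, truncate each
// piece to s16, and rebuild the originally requested packed vector.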
3327 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg, 3328 Register WideDstReg) { 3329 const LLT S32 = LLT::scalar(32); 3330 const LLT S16 = LLT::scalar(16); 3331 3332 auto Unmerge = B.buildUnmerge(S32, WideDstReg); 3333 3334 int NumOps = Unmerge->getNumOperands() - 1; 3335 SmallVector<Register, 4> RemergeParts(NumOps); 3336 for (int I = 0; I != NumOps; ++I) 3337 RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0); 3338 3339 B.buildBuildVector(DstReg, RemergeParts); 3340 } 3341 3342 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3343 MachineInstr &MI, MachineIRBuilder &B, 3344 GISelChangeObserver &Observer, 3345 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3346 bool IsTFE = MI.getNumExplicitDefs() == 2; 3347 3348 // We are only processing the operands of d16 image operations on subtargets 3349 // that use the unpacked register layout, or need to repack the TFE result. 3350 3351 // TODO: Need to handle a16 images too 3352 // TODO: Do we need to guard against already legalized intrinsics? 3353 if (!IsTFE && !ST.hasUnpackedD16VMem()) 3354 return true; 3355 3356 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3357 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3358 3359 if (BaseOpcode->Atomic) // No d16 atomics, or TFE. 3360 return true; 3361 3362 B.setInstr(MI); 3363 3364 MachineRegisterInfo *MRI = B.getMRI(); 3365 const LLT S32 = LLT::scalar(32); 3366 const LLT S16 = LLT::scalar(16); 3367 3368 if (BaseOpcode->Store) { // No TFE for stores? 3369 Register VData = MI.getOperand(1).getReg(); 3370 LLT Ty = MRI->getType(VData); 3371 if (!Ty.isVector() || Ty.getElementType() != S16) 3372 return true; 3373 3374 B.setInstr(MI); 3375 3376 Observer.changingInstr(MI); 3377 MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData)); 3378 Observer.changedInstr(MI); 3379 return true; 3380 } 3381 3382 Register DstReg = MI.getOperand(0).getReg(); 3383 LLT Ty = MRI->getType(DstReg); 3384 const LLT EltTy = Ty.getScalarType(); 3385 const bool IsD16 = Ty.getScalarType() == S16; 3386 const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3387 3388 if (IsTFE) { 3389 // In the IR, TFE is supposed to be used with a 2 element struct return 3390 // type. The intruction really returns these two values in one contiguous 3391 // register, with one additional dword beyond the loaded data. Rewrite the 3392 // return type to use a single register result. 3393 Register Dst1Reg = MI.getOperand(1).getReg(); 3394 if (MRI->getType(Dst1Reg) != S32) 3395 return false; 3396 3397 // TODO: Make sure the TFE operand bit is set. 3398 3399 // The raw dword aligned data component of the load. The only legal cases 3400 // where this matters should be when using the packed D16 format, for 3401 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3402 LLT RoundedTy; 3403 LLT TFETy; 3404 3405 if (IsD16 && ST.hasUnpackedD16VMem()) { 3406 RoundedTy = LLT::scalarOrVector(NumElts, 32); 3407 TFETy = LLT::vector(NumElts + 1, 32); 3408 } else { 3409 unsigned EltSize = Ty.getScalarSizeInBits(); 3410 unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32; 3411 unsigned RoundedSize = 32 * RoundedElts; 3412 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3413 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3414 } 3415 3416 Register TFEReg = MRI->createGenericVirtualRegister(TFETy); 3417 Observer.changingInstr(MI); 3418 3419 MI.getOperand(0).setReg(TFEReg); 3420 MI.RemoveOperand(1); 3421 3422 Observer.changedInstr(MI); 3423 3424 // Insert after the instruction. 
    // Insert after the instruction.
    B.setInsertPt(*MI.getParent(), ++MI.getIterator());

    // Now figure out how to copy the new result register back into the old
    // result.

    SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
    int NumDataElts = TFETy.getNumElements() - 1;

    if (!Ty.isVector()) {
      // Simplest case is a trivial unmerge (plus a truncate for d16).
      UnmergeResults[0] = Ty == S32 ?
        DstReg : MRI->createGenericVirtualRegister(S32);

      B.buildUnmerge(UnmergeResults, TFEReg);
      if (Ty != S32)
        B.buildTrunc(DstReg, UnmergeResults[0]);
      return true;
    }

    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataElts; ++I)
      UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
    B.buildUnmerge(UnmergeResults, TFEReg);

    // Drop the final TFE element.
    ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);

    if (EltTy == S32)
      B.buildBuildVector(DstReg, DataPart);
    else if (ST.hasUnpackedD16VMem())
      truncToS16Vector(B, DstReg, DataPart);
    else
      bitcastToS16Vector(B, DstReg, DataPart);

    return true;
  }

  // Must be an image load.
  if (!Ty.isVector() || Ty.getElementType() != S16)
    return true;

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  LLT WidenedTy = Ty.changeElementType(S32);
  Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);

  Observer.changingInstr(MI);
  MI.getOperand(0).setReg(WideDstReg);
  Observer.changedInstr(MI);

  repackUnpackedD16Load(B, DstReg, WideDstReg);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const unsigned MemAlign = 4;
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant, MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
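  // For example, an s96 result is widened here to s128, and a <3 x s32> result
  // becomes <4 x s32> (see getPow2ScalarType / getPow2VectorType).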
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);
    B.setInstr(MI);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrTarget);

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
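  // The argument intrinsics below are lowered by reading the corresponding
  // preloaded function argument (legalizePreloadedArgIntrin), except for
  // implicitarg_ptr, which has its own helper.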
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}