//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

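// Reduce the element count so the type breaks into pieces of (at most) 64
// bits; if only one element remains, the result collapses to a scalar type.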
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() <
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() >
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr =
      GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ?
                   S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder(G_TRUNC)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
    .scalarize(0)
    .alwaysLegal();

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(narrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return IsLoad ?
          512 : 128;
    default:
      return 128;
    }
  };

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(AddrSpace, true))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
                }

                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    // TODO: Support 16-bit shift amounts
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)

      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .lowerFor({{S16, V2S16}})
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f /
                        numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  // Extract the exponent field from the high half of the double: ubfe takes
  // the source, the bit offset, and the field width.
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));

  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);

  auto Hi = Signed ?
1729 B.buildFPTOSI(S32, FloorMul) : 1730 B.buildFPTOUI(S32, FloorMul); 1731 auto Lo = B.buildFPTOUI(S32, Fma); 1732 1733 B.buildMerge(Dst, { Lo, Hi }); 1734 MI.eraseFromParent(); 1735 1736 return true; 1737 } 1738 1739 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1740 MachineInstr &MI, MachineRegisterInfo &MRI, 1741 MachineIRBuilder &B) const { 1742 MachineFunction &MF = B.getMF(); 1743 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1744 1745 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1746 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1747 1748 // With ieee_mode disabled, the instructions have the correct behavior 1749 // already for G_FMINNUM/G_FMAXNUM 1750 if (!MFI->getMode().IEEE) 1751 return !IsIEEEOp; 1752 1753 if (IsIEEEOp) 1754 return true; 1755 1756 MachineIRBuilder HelperBuilder(MI); 1757 GISelObserverWrapper DummyObserver; 1758 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1759 HelperBuilder.setInstr(MI); 1760 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1761 } 1762 1763 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1764 MachineInstr &MI, MachineRegisterInfo &MRI, 1765 MachineIRBuilder &B) const { 1766 // TODO: Should move some of this into LegalizerHelper. 1767 1768 // TODO: Promote dynamic indexing of s16 to s32 1769 1770 // FIXME: Artifact combiner probably should have replaced the truncated 1771 // constant before this, so we shouldn't need 1772 // getConstantVRegValWithLookThrough. 1773 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1774 MI.getOperand(2).getReg(), MRI); 1775 if (!IdxVal) // Dynamic case will be selected to register indexing. 1776 return true; 1777 1778 Register Dst = MI.getOperand(0).getReg(); 1779 Register Vec = MI.getOperand(1).getReg(); 1780 1781 LLT VecTy = MRI.getType(Vec); 1782 LLT EltTy = VecTy.getElementType(); 1783 assert(EltTy == MRI.getType(Dst)); 1784 1785 B.setInstr(MI); 1786 1787 if (IdxVal->Value < VecTy.getNumElements()) 1788 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1789 else 1790 B.buildUndef(Dst); 1791 1792 MI.eraseFromParent(); 1793 return true; 1794 } 1795 1796 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1797 MachineInstr &MI, MachineRegisterInfo &MRI, 1798 MachineIRBuilder &B) const { 1799 // TODO: Should move some of this into LegalizerHelper. 1800 1801 // TODO: Promote dynamic indexing of s16 to s32 1802 1803 // FIXME: Artifact combiner probably should have replaced the truncated 1804 // constant before this, so we shouldn't need 1805 // getConstantVRegValWithLookThrough. 1806 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1807 MI.getOperand(3).getReg(), MRI); 1808 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1809 return true; 1810 1811 Register Dst = MI.getOperand(0).getReg(); 1812 Register Vec = MI.getOperand(1).getReg(); 1813 Register Ins = MI.getOperand(2).getReg(); 1814 1815 LLT VecTy = MRI.getType(Vec); 1816 LLT EltTy = VecTy.getElementType(); 1817 assert(EltTy == MRI.getType(Ins)); 1818 1819 B.setInstr(MI); 1820 1821 if (IdxVal->Value < VecTy.getNumElements()) 1822 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1823 else 1824 B.buildUndef(Dst); 1825 1826 MI.eraseFromParent(); 1827 return true; 1828 } 1829 1830 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1831 MachineInstr &MI, MachineRegisterInfo &MRI, 1832 MachineIRBuilder &B) const { 1833 const LLT V2S16 = LLT::vector(2, 16); 1834 1835 Register Dst = MI.getOperand(0).getReg(); 1836 Register Src0 = MI.getOperand(1).getReg(); 1837 LLT DstTy = MRI.getType(Dst); 1838 LLT SrcTy = MRI.getType(Src0); 1839 1840 if (SrcTy == V2S16 && DstTy == V2S16 && 1841 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1842 return true; 1843 1844 MachineIRBuilder HelperBuilder(MI); 1845 GISelObserverWrapper DummyObserver; 1846 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1847 HelperBuilder.setInstr(MI); 1848 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1849 } 1850 1851 bool AMDGPULegalizerInfo::legalizeSinCos( 1852 MachineInstr &MI, MachineRegisterInfo &MRI, 1853 MachineIRBuilder &B) const { 1854 B.setInstr(MI); 1855 1856 Register DstReg = MI.getOperand(0).getReg(); 1857 Register SrcReg = MI.getOperand(1).getReg(); 1858 LLT Ty = MRI.getType(DstReg); 1859 unsigned Flags = MI.getFlags(); 1860 1861 Register TrigVal; 1862 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1863 if (ST.hasTrigReducedRange()) { 1864 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1865 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1866 .addUse(MulVal.getReg(0)) 1867 .setMIFlags(Flags).getReg(0); 1868 } else 1869 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1870 1871 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1872 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1873 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1874 .addUse(TrigVal) 1875 .setMIFlags(Flags); 1876 MI.eraseFromParent(); 1877 return true; 1878 } 1879 1880 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1881 Register DstReg, LLT PtrTy, 1882 MachineIRBuilder &B, const GlobalValue *GV, 1883 unsigned Offset, unsigned GAFlags) const { 1884 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1885 // to the following code sequence: 1886 // 1887 // For constant address space: 1888 // s_getpc_b64 s[0:1] 1889 // s_add_u32 s0, s0, $symbol 1890 // s_addc_u32 s1, s1, 0 1891 // 1892 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1893 // a fixup or relocation is emitted to replace $symbol with a literal 1894 // constant, which is a pc-relative offset from the encoding of the $symbol 1895 // operand to the global variable. 
1896 // 1897 // For global address space: 1898 // s_getpc_b64 s[0:1] 1899 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1900 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1901 // 1902 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1903 // fixups or relocations are emitted to replace $symbol@*@lo and 1904 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1905 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1906 // operand to the global variable. 1907 // 1908 // What we want here is an offset from the value returned by s_getpc 1909 // (which is the address of the s_add_u32 instruction) to the global 1910 // variable, but since the encoding of $symbol starts 4 bytes after the start 1911 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1912 // small. This requires us to add 4 to the global variable offset in order to 1913 // compute the correct address. 1914 1915 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1916 1917 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1918 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1919 1920 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1921 .addDef(PCReg); 1922 1923 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1924 if (GAFlags == SIInstrInfo::MO_NONE) 1925 MIB.addImm(0); 1926 else 1927 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1928 1929 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1930 1931 if (PtrTy.getSizeInBits() == 32) 1932 B.buildExtract(DstReg, PCReg, 0); 1933 return true; 1934 } 1935 1936 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1937 MachineInstr &MI, MachineRegisterInfo &MRI, 1938 MachineIRBuilder &B) const { 1939 Register DstReg = MI.getOperand(0).getReg(); 1940 LLT Ty = MRI.getType(DstReg); 1941 unsigned AS = Ty.getAddressSpace(); 1942 1943 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1944 MachineFunction &MF = B.getMF(); 1945 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1946 B.setInstr(MI); 1947 1948 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1949 if (!MFI->isEntryFunction()) { 1950 const Function &Fn = MF.getFunction(); 1951 DiagnosticInfoUnsupported BadLDSDecl( 1952 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1953 Fn.getContext().diagnose(BadLDSDecl); 1954 } 1955 1956 // TODO: We could emit code to handle the initialization somewhere. 
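// For an LDS global without a usable initializer there are two ways to
// resolve the address: keep the G_GLOBAL_VALUE but tag the operand with
// MO_ABS32_LO so selection materializes the symbol directly, or fold it to
// the constant offset returned by allocateLDSGlobal. Globals that do carry an
// initializer are diagnosed as unsupported below.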
1957 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1958 const SITargetLowering *TLI = ST.getTargetLowering();
1959 if (!TLI->shouldUseLDSConstAddress(GV)) {
1960 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1961 return true; // Leave in place.
1962 }
1963
1964 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1965 MI.eraseFromParent();
1966 return true;
1967 }
1968
1969 const Function &Fn = MF.getFunction();
1970 DiagnosticInfoUnsupported BadInit(
1971 Fn, "unsupported initializer for address space", MI.getDebugLoc());
1972 Fn.getContext().diagnose(BadInit);
1973 return true;
1974 }
1975
1976 const SITargetLowering *TLI = ST.getTargetLowering();
1977
1978 if (TLI->shouldEmitFixup(GV)) {
1979 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1980 MI.eraseFromParent();
1981 return true;
1982 }
1983
1984 if (TLI->shouldEmitPCReloc(GV)) {
1985 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1986 MI.eraseFromParent();
1987 return true;
1988 }
1989
1990 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1991 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1992
1993 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1994 MachinePointerInfo::getGOT(MF),
1995 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1996 MachineMemOperand::MOInvariant,
1997 8 /*Size*/, 8 /*Align*/);
1998
1999 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2000
2001 if (Ty.getSizeInBits() == 32) {
2002 // Truncate if this is a 32-bit constant address.
2003 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2004 B.buildExtract(DstReg, Load, 0);
2005 } else
2006 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2007
2008 MI.eraseFromParent();
2009 return true;
2010 }
2011
2012 bool AMDGPULegalizerInfo::legalizeLoad(
2013 MachineInstr &MI, MachineRegisterInfo &MRI,
2014 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2015 B.setInstr(MI);
2016 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2017 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2018 Observer.changingInstr(MI);
2019 MI.getOperand(1).setReg(Cast.getReg(0));
2020 Observer.changedInstr(MI);
2021 return true;
2022 }
2023
2024 bool AMDGPULegalizerInfo::legalizeFMad(
2025 MachineInstr &MI, MachineRegisterInfo &MRI,
2026 MachineIRBuilder &B) const {
2027 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2028 assert(Ty.isScalar());
2029
2030 MachineFunction &MF = B.getMF();
2031 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2032
2033 // TODO: Always legal with future ftz flag.
2034 // FIXME: Do we need just output?
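// G_FMAD is only kept when denormals for the result type are flushed; the
// mad/mac instructions do not handle denormal values. Otherwise it is
// expanded to fmul+fadd by LegalizerHelper::lowerFMad below.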
2035 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2036 return true; 2037 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2038 return true; 2039 2040 MachineIRBuilder HelperBuilder(MI); 2041 GISelObserverWrapper DummyObserver; 2042 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2043 HelperBuilder.setMBB(*MI.getParent()); 2044 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2045 } 2046 2047 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2048 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2049 Register DstReg = MI.getOperand(0).getReg(); 2050 Register PtrReg = MI.getOperand(1).getReg(); 2051 Register CmpVal = MI.getOperand(2).getReg(); 2052 Register NewVal = MI.getOperand(3).getReg(); 2053 2054 assert(SITargetLowering::isFlatGlobalAddrSpace( 2055 MRI.getType(PtrReg).getAddressSpace()) && 2056 "this should not have been custom lowered"); 2057 2058 LLT ValTy = MRI.getType(CmpVal); 2059 LLT VecTy = LLT::vector(2, ValTy); 2060 2061 B.setInstr(MI); 2062 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2063 2064 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2065 .addDef(DstReg) 2066 .addUse(PtrReg) 2067 .addUse(PackedVal) 2068 .setMemRefs(MI.memoperands()); 2069 2070 MI.eraseFromParent(); 2071 return true; 2072 } 2073 2074 bool AMDGPULegalizerInfo::legalizeFlog( 2075 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2076 Register Dst = MI.getOperand(0).getReg(); 2077 Register Src = MI.getOperand(1).getReg(); 2078 LLT Ty = B.getMRI()->getType(Dst); 2079 unsigned Flags = MI.getFlags(); 2080 B.setInstr(MI); 2081 2082 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2083 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2084 2085 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2086 MI.eraseFromParent(); 2087 return true; 2088 } 2089 2090 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2091 MachineIRBuilder &B) const { 2092 Register Dst = MI.getOperand(0).getReg(); 2093 Register Src = MI.getOperand(1).getReg(); 2094 unsigned Flags = MI.getFlags(); 2095 LLT Ty = B.getMRI()->getType(Dst); 2096 B.setInstr(MI); 2097 2098 auto K = B.buildFConstant(Ty, numbers::log2e); 2099 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2100 B.buildFExp2(Dst, Mul, Flags); 2101 MI.eraseFromParent(); 2102 return true; 2103 } 2104 2105 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2106 MachineIRBuilder &B) const { 2107 Register Dst = MI.getOperand(0).getReg(); 2108 Register Src0 = MI.getOperand(1).getReg(); 2109 Register Src1 = MI.getOperand(2).getReg(); 2110 unsigned Flags = MI.getFlags(); 2111 LLT Ty = B.getMRI()->getType(Dst); 2112 B.setInstr(MI); 2113 const LLT S16 = LLT::scalar(16); 2114 const LLT S32 = LLT::scalar(32); 2115 2116 if (Ty == S32) { 2117 auto Log = B.buildFLog2(S32, Src0, Flags); 2118 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2119 .addUse(Log.getReg(0)) 2120 .addUse(Src1) 2121 .setMIFlags(Flags); 2122 B.buildFExp2(Dst, Mul, Flags); 2123 } else if (Ty == S16) { 2124 // There's no f16 fmul_legacy, so we need to convert for it. 
2125 auto Log = B.buildFLog2(S16, Src0, Flags); 2126 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2127 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2128 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2129 .addUse(Ext0.getReg(0)) 2130 .addUse(Ext1.getReg(0)) 2131 .setMIFlags(Flags); 2132 2133 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2134 } else 2135 return false; 2136 2137 MI.eraseFromParent(); 2138 return true; 2139 } 2140 2141 // Find a source register, ignoring any possible source modifiers. 2142 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2143 Register ModSrc = OrigSrc; 2144 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2145 ModSrc = SrcFNeg->getOperand(1).getReg(); 2146 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2147 ModSrc = SrcFAbs->getOperand(1).getReg(); 2148 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2149 ModSrc = SrcFAbs->getOperand(1).getReg(); 2150 return ModSrc; 2151 } 2152 2153 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2154 MachineRegisterInfo &MRI, 2155 MachineIRBuilder &B) const { 2156 B.setInstr(MI); 2157 2158 const LLT S1 = LLT::scalar(1); 2159 const LLT S64 = LLT::scalar(64); 2160 Register Dst = MI.getOperand(0).getReg(); 2161 Register OrigSrc = MI.getOperand(1).getReg(); 2162 unsigned Flags = MI.getFlags(); 2163 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2164 "this should not have been custom lowered"); 2165 2166 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2167 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2168 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2169 // V_FRACT bug is: 2170 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2171 // 2172 // Convert floor(x) to (x - fract(x)) 2173 2174 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2175 .addUse(OrigSrc) 2176 .setMIFlags(Flags); 2177 2178 // Give source modifier matching some assistance before obscuring a foldable 2179 // pattern. 2180 2181 // TODO: We can avoid the neg on the fract? The input sign to fract 2182 // shouldn't matter? 2183 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2184 2185 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2186 2187 Register Min = MRI.createGenericVirtualRegister(S64); 2188 2189 // We don't need to concern ourselves with the snan handling difference, so 2190 // use the one which will directly select. 2191 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2192 if (MFI->getMode().IEEE) 2193 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2194 else 2195 B.buildFMinNum(Min, Fract, Const, Flags); 2196 2197 Register CorrectedFract = Min; 2198 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2199 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2200 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2201 } 2202 2203 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2204 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2205 2206 MI.eraseFromParent(); 2207 return true; 2208 } 2209 2210 // Turn an illegal packed v2s16 build vector into bit operations. 2211 // TODO: This should probably be a bitcast action in LegalizerHelper. 
2212 bool AMDGPULegalizerInfo::legalizeBuildVector(
2213 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2214 Register Dst = MI.getOperand(0).getReg();
2215 LLT DstTy = MRI.getType(Dst);
2216 const LLT S32 = LLT::scalar(32);
2217 const LLT V2S16 = LLT::vector(2, 16);
2218 (void)DstTy;
2219 (void)V2S16;
2220 assert(DstTy == V2S16);
2221
2222 Register Src0 = MI.getOperand(1).getReg();
2223 Register Src1 = MI.getOperand(2).getReg();
2224 assert(MRI.getType(Src0) == LLT::scalar(16));
2225
2226 B.setInstr(MI);
2227 auto Merge = B.buildMerge(S32, {Src0, Src1});
2228 B.buildBitcast(Dst, Merge);
2229
2230 MI.eraseFromParent();
2231 return true;
2232 }
2233
2234 // Return the use branch instruction, or null if the usage is invalid.
2235 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2236 MachineRegisterInfo &MRI,
2237 MachineInstr *&Br) {
2238 Register CondDef = MI.getOperand(0).getReg();
2239 if (!MRI.hasOneNonDBGUse(CondDef))
2240 return nullptr;
2241
2242 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2243 if (UseMI.getParent() != MI.getParent() ||
2244 UseMI.getOpcode() != AMDGPU::G_BRCOND)
2245 return nullptr;
2246
2247 // Make sure the cond br is followed by a G_BR
2248 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2249 if (Next != MI.getParent()->end()) {
2250 if (Next->getOpcode() != AMDGPU::G_BR)
2251 return nullptr;
2252 Br = &*Next;
2253 }
2254
2255 return &UseMI;
2256 }
2257
2258 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
2259 Register Reg, LLT Ty) const {
2260 Register LiveIn = MRI.getLiveInVirtReg(Reg);
2261 if (LiveIn)
2262 return LiveIn;
2263
2264 Register NewReg = MRI.createGenericVirtualRegister(Ty);
2265 MRI.addLiveIn(Reg, NewReg);
2266 return NewReg;
2267 }
2268
2269 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2270 const ArgDescriptor *Arg) const {
2271 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2272 return false; // TODO: Handle these
2273
2274 assert(Arg->getRegister().isPhysical());
2275
2276 MachineRegisterInfo &MRI = *B.getMRI();
2277
2278 LLT Ty = MRI.getType(DstReg);
2279 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2280
2281 if (Arg->isMasked()) {
2282 // TODO: Should we try to emit this once in the entry block?
2283 const LLT S32 = LLT::scalar(32);
2284 const unsigned Mask = Arg->getMask();
2285 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2286
2287 Register AndMaskSrc = LiveIn;
2288
2289 if (Shift != 0) {
2290 auto ShiftAmt = B.buildConstant(S32, Shift);
2291 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2292 }
2293
2294 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2295 } else
2296 B.buildCopy(DstReg, LiveIn);
2297
2298 // Insert the argument copy if it doesn't already exist.
2299 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2300 if (!MRI.getVRegDef(LiveIn)) { 2301 // FIXME: Should have scoped insert pt 2302 MachineBasicBlock &OrigInsBB = B.getMBB(); 2303 auto OrigInsPt = B.getInsertPt(); 2304 2305 MachineBasicBlock &EntryMBB = B.getMF().front(); 2306 EntryMBB.addLiveIn(Arg->getRegister()); 2307 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2308 B.buildCopy(LiveIn, Arg->getRegister()); 2309 2310 B.setInsertPt(OrigInsBB, OrigInsPt); 2311 } 2312 2313 return true; 2314 } 2315 2316 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2317 MachineInstr &MI, 2318 MachineRegisterInfo &MRI, 2319 MachineIRBuilder &B, 2320 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2321 B.setInstr(MI); 2322 2323 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2324 2325 const ArgDescriptor *Arg; 2326 const TargetRegisterClass *RC; 2327 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2328 if (!Arg) { 2329 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2330 return false; 2331 } 2332 2333 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 2334 MI.eraseFromParent(); 2335 return true; 2336 } 2337 2338 return false; 2339 } 2340 2341 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2342 MachineRegisterInfo &MRI, 2343 MachineIRBuilder &B) const { 2344 B.setInstr(MI); 2345 Register Dst = MI.getOperand(0).getReg(); 2346 LLT DstTy = MRI.getType(Dst); 2347 LLT S16 = LLT::scalar(16); 2348 LLT S32 = LLT::scalar(32); 2349 LLT S64 = LLT::scalar(64); 2350 2351 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2352 return true; 2353 2354 if (DstTy == S16) 2355 return legalizeFDIV16(MI, MRI, B); 2356 if (DstTy == S32) 2357 return legalizeFDIV32(MI, MRI, B); 2358 if (DstTy == S64) 2359 return legalizeFDIV64(MI, MRI, B); 2360 2361 return false; 2362 } 2363 2364 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2365 const LLT S32 = LLT::scalar(32); 2366 2367 auto Cvt0 = B.buildUITOFP(S32, Src); 2368 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2369 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2370 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2371 return B.buildFPTOUI(S32, Mul).getReg(0); 2372 } 2373 2374 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2375 Register DstReg, 2376 Register Num, 2377 Register Den, 2378 bool IsRem) const { 2379 const LLT S1 = LLT::scalar(1); 2380 const LLT S32 = LLT::scalar(32); 2381 2382 // RCP = URECIP(Den) = 2^32 / Den + e 2383 // e is rounding error. 2384 auto RCP = buildDivRCP(B, Den); 2385 2386 // RCP_LO = mul(RCP, Den) 2387 auto RCP_LO = B.buildMul(S32, RCP, Den); 2388 2389 // RCP_HI = mulhu (RCP, Den) */ 2390 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2391 2392 // NEG_RCP_LO = -RCP_LO 2393 auto Zero = B.buildConstant(S32, 0); 2394 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2395 2396 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2397 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2398 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2399 2400 // Calculate the rounding error from the URECIP instruction 2401 // E = mulhu(ABS_RCP_LO, RCP) 2402 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2403 2404 // RCP_A_E = RCP + E 2405 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2406 2407 // RCP_S_E = RCP - E 2408 auto RCP_S_E = B.buildSub(S32, RCP, E); 2409 2410 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_S_E)
2411 auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2412
2413 // Quotient = mulhu(Tmp0, Num)
2414 auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2415
2416 // Num_S_Remainder = Quotient * Den
2417 auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2418
2419 // Remainder = Num - Num_S_Remainder
2420 auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2421
2422 // Remainder_GE_Den = Remainder >= Den
2423 auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2424
2425 // Remainder_GE_Zero = Num >= Num_S_Remainder
2426 auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2427 Num, Num_S_Remainder);
2428
2429 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2430 auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2431
2432 // Calculate Division result:
2433
2434 // Quotient_A_One = Quotient + 1
2435 auto One = B.buildConstant(S32, 1);
2436 auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2437
2438 // Quotient_S_One = Quotient - 1
2439 auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2440
2441 // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2442 auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2443
2444 // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2445 if (IsRem) {
2446 Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2447
2448 // Calculate Rem result:
2449 auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2450
2451 // Remainder_A_Den = Remainder + Den
2452 auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2453
2454 // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2455 auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2456
2457 // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2458 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2459 } else {
2460 B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2461 }
2462 }
2463
2464 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2465 MachineRegisterInfo &MRI,
2466 MachineIRBuilder &B) const {
2467 B.setInstr(MI);
2468 const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2469 Register DstReg = MI.getOperand(0).getReg();
2470 Register Num = MI.getOperand(1).getReg();
2471 Register Den = MI.getOperand(2).getReg();
2472 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2473 MI.eraseFromParent();
2474 return true;
2475 }
2476
2477 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2478 MachineRegisterInfo &MRI,
2479 MachineIRBuilder &B) const {
2480 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2481 return legalizeUDIV_UREM32(MI, MRI, B);
2482 return false;
2483 }
2484
2485 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2486 MachineRegisterInfo &MRI,
2487 MachineIRBuilder &B) const {
2488 B.setInstr(MI);
2489 const LLT S32 = LLT::scalar(32);
2490
2491 const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2492 Register DstReg = MI.getOperand(0).getReg();
2493 Register LHS = MI.getOperand(1).getReg();
2494 Register RHS = MI.getOperand(2).getReg();
2495
2496 auto ThirtyOne = B.buildConstant(S32, 31);
2497 auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2498 auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2499
2500 LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2501 RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2502
2503 LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2504 RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2505
2506 Register UDivRem = MRI.createGenericVirtualRegister(S32);
2507 legalizeUDIV_UREM32Impl(B,
UDivRem, LHS, RHS, IsRem); 2508 2509 if (IsRem) { 2510 auto RSign = LHSign; // Remainder sign is the same as LHS 2511 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0); 2512 B.buildSub(DstReg, UDivRem, RSign); 2513 } else { 2514 auto DSign = B.buildXor(S32, LHSign, RHSign); 2515 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0); 2516 B.buildSub(DstReg, UDivRem, DSign); 2517 } 2518 2519 MI.eraseFromParent(); 2520 return true; 2521 } 2522 2523 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2524 MachineRegisterInfo &MRI, 2525 MachineIRBuilder &B) const { 2526 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2527 return legalizeSDIV_SREM32(MI, MRI, B); 2528 return false; 2529 } 2530 2531 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2532 MachineRegisterInfo &MRI, 2533 MachineIRBuilder &B) const { 2534 Register Res = MI.getOperand(0).getReg(); 2535 Register LHS = MI.getOperand(1).getReg(); 2536 Register RHS = MI.getOperand(2).getReg(); 2537 2538 uint16_t Flags = MI.getFlags(); 2539 2540 LLT ResTy = MRI.getType(Res); 2541 LLT S32 = LLT::scalar(32); 2542 LLT S64 = LLT::scalar(64); 2543 2544 const MachineFunction &MF = B.getMF(); 2545 bool Unsafe = 2546 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2547 2548 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2549 return false; 2550 2551 if (!Unsafe && ResTy == S32 && 2552 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2553 return false; 2554 2555 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2556 // 1 / x -> RCP(x) 2557 if (CLHS->isExactlyValue(1.0)) { 2558 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2559 .addUse(RHS) 2560 .setMIFlags(Flags); 2561 2562 MI.eraseFromParent(); 2563 return true; 2564 } 2565 2566 // -1 / x -> RCP( FNEG(x) ) 2567 if (CLHS->isExactlyValue(-1.0)) { 2568 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2569 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2570 .addUse(FNeg.getReg(0)) 2571 .setMIFlags(Flags); 2572 2573 MI.eraseFromParent(); 2574 return true; 2575 } 2576 } 2577 2578 // x / y -> x * (1.0 / y) 2579 if (Unsafe) { 2580 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2581 .addUse(RHS) 2582 .setMIFlags(Flags); 2583 B.buildFMul(Res, LHS, RCP, Flags); 2584 2585 MI.eraseFromParent(); 2586 return true; 2587 } 2588 2589 return false; 2590 } 2591 2592 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2593 MachineRegisterInfo &MRI, 2594 MachineIRBuilder &B) const { 2595 B.setInstr(MI); 2596 Register Res = MI.getOperand(0).getReg(); 2597 Register LHS = MI.getOperand(1).getReg(); 2598 Register RHS = MI.getOperand(2).getReg(); 2599 2600 uint16_t Flags = MI.getFlags(); 2601 2602 LLT S16 = LLT::scalar(16); 2603 LLT S32 = LLT::scalar(32); 2604 2605 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2606 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2607 2608 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2609 .addUse(RHSExt.getReg(0)) 2610 .setMIFlags(Flags); 2611 2612 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2613 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2614 2615 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2616 .addUse(RDst.getReg(0)) 2617 .addUse(RHS) 2618 .addUse(LHS) 2619 .setMIFlags(Flags); 2620 2621 MI.eraseFromParent(); 2622 return true; 2623 } 2624 2625 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2626 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 
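// On subtargets with S_DENORM_MODE this is a single immediate write that also
// keeps the current FP64/FP16 setting; otherwise it falls back to S_SETREG of
// hwreg(MODE, offset 4, width 2), the bit field selected below for FP32
// denormal control.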
2627 static void toggleSPDenormMode(bool Enable, 2628 MachineIRBuilder &B, 2629 const GCNSubtarget &ST, 2630 AMDGPU::SIModeRegisterDefaults Mode) { 2631 // Set SP denorm mode to this value. 2632 unsigned SPDenormMode = 2633 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2634 2635 if (ST.hasDenormModeInst()) { 2636 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2637 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2638 2639 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2640 B.buildInstr(AMDGPU::S_DENORM_MODE) 2641 .addImm(NewDenormModeValue); 2642 2643 } else { 2644 // Select FP32 bit field in mode register. 2645 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2646 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2647 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2648 2649 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2650 .addImm(SPDenormMode) 2651 .addImm(SPDenormModeBitField); 2652 } 2653 } 2654 2655 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2656 MachineRegisterInfo &MRI, 2657 MachineIRBuilder &B) const { 2658 B.setInstr(MI); 2659 Register Res = MI.getOperand(0).getReg(); 2660 Register LHS = MI.getOperand(1).getReg(); 2661 Register RHS = MI.getOperand(2).getReg(); 2662 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2663 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2664 2665 uint16_t Flags = MI.getFlags(); 2666 2667 LLT S32 = LLT::scalar(32); 2668 LLT S1 = LLT::scalar(1); 2669 2670 auto One = B.buildFConstant(S32, 1.0f); 2671 2672 auto DenominatorScaled = 2673 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2674 .addUse(RHS) 2675 .addUse(LHS) 2676 .addImm(1) 2677 .setMIFlags(Flags); 2678 auto NumeratorScaled = 2679 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2680 .addUse(LHS) 2681 .addUse(RHS) 2682 .addImm(0) 2683 .setMIFlags(Flags); 2684 2685 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2686 .addUse(DenominatorScaled.getReg(0)) 2687 .setMIFlags(Flags); 2688 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2689 2690 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2691 // aren't modeled as reading it. 
2692 if (!Mode.allFP32Denormals())
2693 toggleSPDenormMode(true, B, ST, Mode);
2694
2695 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2696 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2697 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2698 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2699 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2700 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2701
2702 if (!Mode.allFP32Denormals())
2703 toggleSPDenormMode(false, B, ST, Mode);
2704
2705 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2706 .addUse(Fma4.getReg(0))
2707 .addUse(Fma1.getReg(0))
2708 .addUse(Fma3.getReg(0))
2709 .addUse(NumeratorScaled.getReg(1))
2710 .setMIFlags(Flags);
2711
2712 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2713 .addUse(Fmas.getReg(0))
2714 .addUse(RHS)
2715 .addUse(LHS)
2716 .setMIFlags(Flags);
2717
2718 MI.eraseFromParent();
2719 return true;
2720 }
2721
2722 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2723 MachineRegisterInfo &MRI,
2724 MachineIRBuilder &B) const {
2725 B.setInstr(MI);
2726 Register Res = MI.getOperand(0).getReg();
2727 Register LHS = MI.getOperand(1).getReg();
2728 Register RHS = MI.getOperand(2).getReg();
2729
2730 uint16_t Flags = MI.getFlags();
2731
2732 LLT S64 = LLT::scalar(64);
2733 LLT S1 = LLT::scalar(1);
2734
2735 auto One = B.buildFConstant(S64, 1.0);
2736
2737 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2738 .addUse(LHS)
2739 .addUse(RHS)
2740 .addImm(1)
2741 .setMIFlags(Flags);
2742
2743 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2744
2745 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2746 .addUse(DivScale0.getReg(0))
2747 .setMIFlags(Flags);
2748
2749 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2750 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2751 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2752
2753 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2754 .addUse(LHS)
2755 .addUse(RHS)
2756 .addImm(0)
2757 .setMIFlags(Flags);
2758
2759 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2760 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2761 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2762
2763 Register Scale;
2764 if (!ST.hasUsableDivScaleConditionOutput()) {
2765 // Workaround a hardware bug on SI where the condition output from div_scale
2766 // is not usable.
2767 2768 LLT S32 = LLT::scalar(32); 2769 2770 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2771 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2772 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2773 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2774 2775 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2776 Scale1Unmerge.getReg(1)); 2777 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2778 Scale0Unmerge.getReg(1)); 2779 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2780 } else { 2781 Scale = DivScale1.getReg(1); 2782 } 2783 2784 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2785 .addUse(Fma4.getReg(0)) 2786 .addUse(Fma3.getReg(0)) 2787 .addUse(Mul.getReg(0)) 2788 .addUse(Scale) 2789 .setMIFlags(Flags); 2790 2791 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2792 .addUse(Fmas.getReg(0)) 2793 .addUse(RHS) 2794 .addUse(LHS) 2795 .setMIFlags(Flags); 2796 2797 MI.eraseFromParent(); 2798 return true; 2799 } 2800 2801 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2802 MachineRegisterInfo &MRI, 2803 MachineIRBuilder &B) const { 2804 B.setInstr(MI); 2805 Register Res = MI.getOperand(0).getReg(); 2806 Register LHS = MI.getOperand(2).getReg(); 2807 Register RHS = MI.getOperand(3).getReg(); 2808 uint16_t Flags = MI.getFlags(); 2809 2810 LLT S32 = LLT::scalar(32); 2811 LLT S1 = LLT::scalar(1); 2812 2813 auto Abs = B.buildFAbs(S32, RHS, Flags); 2814 const APFloat C0Val(1.0f); 2815 2816 auto C0 = B.buildConstant(S32, 0x6f800000); 2817 auto C1 = B.buildConstant(S32, 0x2f800000); 2818 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2819 2820 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2821 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2822 2823 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2824 2825 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2826 .addUse(Mul0.getReg(0)) 2827 .setMIFlags(Flags); 2828 2829 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2830 2831 B.buildFMul(Res, Sel, Mul1, Flags); 2832 2833 MI.eraseFromParent(); 2834 return true; 2835 } 2836 2837 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2838 MachineRegisterInfo &MRI, 2839 MachineIRBuilder &B) const { 2840 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2841 if (!MFI->isEntryFunction()) { 2842 return legalizePreloadedArgIntrin(MI, MRI, B, 2843 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2844 } 2845 2846 B.setInstr(MI); 2847 2848 uint64_t Offset = 2849 ST.getTargetLowering()->getImplicitParameterOffset( 2850 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2851 Register DstReg = MI.getOperand(0).getReg(); 2852 LLT DstTy = MRI.getType(DstReg); 2853 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2854 2855 const ArgDescriptor *Arg; 2856 const TargetRegisterClass *RC; 2857 std::tie(Arg, RC) 2858 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2859 if (!Arg) 2860 return false; 2861 2862 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2863 if (!loadInputValue(KernargPtrReg, B, Arg)) 2864 return false; 2865 2866 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2867 MI.eraseFromParent(); 2868 return true; 2869 } 2870 2871 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2872 MachineRegisterInfo &MRI, 2873 MachineIRBuilder &B, 2874 unsigned AddrSpace) const { 2875 B.setInstr(MI); 2876 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2877 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2878 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2879 MI.eraseFromParent(); 2880 return true; 2881 } 2882 2883 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2884 // offset (the offset that is included in bounds checking and swizzling, to be 2885 // split between the instruction's voffset and immoffset fields) and soffset 2886 // (the offset that is excluded from bounds checking and swizzling, to go in 2887 // the instruction's soffset field). This function takes the first kind of 2888 // offset and figures out how to split it between voffset and immoffset. 2889 std::tuple<Register, unsigned, unsigned> 2890 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2891 Register OrigOffset) const { 2892 const unsigned MaxImm = 4095; 2893 Register BaseReg; 2894 unsigned TotalConstOffset; 2895 MachineInstr *OffsetDef; 2896 const LLT S32 = LLT::scalar(32); 2897 2898 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2899 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2900 2901 unsigned ImmOffset = TotalConstOffset; 2902 2903 // If the immediate value is too big for the immoffset field, put the value 2904 // and -4096 into the immoffset field so that the value that is copied/added 2905 // for the voffset field is a multiple of 4096, and it stands more chance 2906 // of being CSEd with the copy/add for another similar load/store. 2907 // However, do not do that rounding down to a multiple of 4096 if that is a 2908 // negative number, as it appears to be illegal to have a negative offset 2909 // in the vgpr, even if adding the immediate offset makes it positive. 2910 unsigned Overflow = ImmOffset & ~MaxImm; 2911 ImmOffset -= Overflow; 2912 if ((int32_t)Overflow < 0) { 2913 Overflow += ImmOffset; 2914 ImmOffset = 0; 2915 } 2916 2917 if (Overflow != 0) { 2918 if (!BaseReg) { 2919 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2920 } else { 2921 auto OverflowVal = B.buildConstant(S32, Overflow); 2922 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2923 } 2924 } 2925 2926 if (!BaseReg) 2927 BaseReg = B.buildConstant(S32, 0).getReg(0); 2928 2929 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2930 } 2931 2932 /// Handle register layout difference for f16 images for some subtargets. 2933 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2934 MachineRegisterInfo &MRI, 2935 Register Reg) const { 2936 if (!ST.hasUnpackedD16VMem()) 2937 return Reg; 2938 2939 const LLT S16 = LLT::scalar(16); 2940 const LLT S32 = LLT::scalar(32); 2941 LLT StoreVT = MRI.getType(Reg); 2942 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2943 2944 auto Unmerge = B.buildUnmerge(S16, Reg); 2945 2946 SmallVector<Register, 4> WideRegs; 2947 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2948 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2949 2950 int NumElts = StoreVT.getNumElements(); 2951 2952 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2953 } 2954 2955 Register AMDGPULegalizerInfo::fixStoreSourceType( 2956 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2957 MachineRegisterInfo *MRI = B.getMRI(); 2958 LLT Ty = MRI->getType(VData); 2959 2960 const LLT S16 = LLT::scalar(16); 2961 2962 // Fixup illegal register types for i8 stores. 
2963 if (Ty == LLT::scalar(8) || Ty == S16) { 2964 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2965 return AnyExt; 2966 } 2967 2968 if (Ty.isVector()) { 2969 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2970 if (IsFormat) 2971 return handleD16VData(B, *MRI, VData); 2972 } 2973 } 2974 2975 return VData; 2976 } 2977 2978 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2979 MachineRegisterInfo &MRI, 2980 MachineIRBuilder &B, 2981 bool IsTyped, 2982 bool IsFormat) const { 2983 B.setInstr(MI); 2984 2985 Register VData = MI.getOperand(1).getReg(); 2986 LLT Ty = MRI.getType(VData); 2987 LLT EltTy = Ty.getScalarType(); 2988 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2989 const LLT S32 = LLT::scalar(32); 2990 2991 VData = fixStoreSourceType(B, VData, IsFormat); 2992 Register RSrc = MI.getOperand(2).getReg(); 2993 2994 MachineMemOperand *MMO = *MI.memoperands_begin(); 2995 const int MemSize = MMO->getSize(); 2996 2997 unsigned ImmOffset; 2998 unsigned TotalOffset; 2999 3000 // The typed intrinsics add an immediate after the registers. 3001 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3002 3003 // The struct intrinsic variants add one additional operand over raw. 3004 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3005 Register VIndex; 3006 int OpOffset = 0; 3007 if (HasVIndex) { 3008 VIndex = MI.getOperand(3).getReg(); 3009 OpOffset = 1; 3010 } 3011 3012 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3013 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3014 3015 unsigned Format = 0; 3016 if (IsTyped) { 3017 Format = MI.getOperand(5 + OpOffset).getImm(); 3018 ++OpOffset; 3019 } 3020 3021 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3022 3023 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3024 if (TotalOffset != 0) 3025 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3026 3027 unsigned Opc; 3028 if (IsTyped) { 3029 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3030 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3031 } else if (IsFormat) { 3032 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3033 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3034 } else { 3035 switch (MemSize) { 3036 case 1: 3037 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3038 break; 3039 case 2: 3040 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3041 break; 3042 default: 3043 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3044 break; 3045 } 3046 } 3047 3048 if (!VIndex) 3049 VIndex = B.buildConstant(S32, 0).getReg(0); 3050 3051 auto MIB = B.buildInstr(Opc) 3052 .addUse(VData) // vdata 3053 .addUse(RSrc) // rsrc 3054 .addUse(VIndex) // vindex 3055 .addUse(VOffset) // voffset 3056 .addUse(SOffset) // soffset 3057 .addImm(ImmOffset); // offset(imm) 3058 3059 if (IsTyped) 3060 MIB.addImm(Format); 3061 3062 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3063 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3064 .addMemOperand(MMO); 3065 3066 MI.eraseFromParent(); 3067 return true; 3068 } 3069 3070 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3071 MachineRegisterInfo &MRI, 3072 MachineIRBuilder &B, 3073 bool IsFormat, 3074 bool IsTyped) const { 3075 B.setInstr(MI); 3076 3077 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
3078 MachineMemOperand *MMO = *MI.memoperands_begin(); 3079 const int MemSize = MMO->getSize(); 3080 const LLT S32 = LLT::scalar(32); 3081 3082 Register Dst = MI.getOperand(0).getReg(); 3083 Register RSrc = MI.getOperand(2).getReg(); 3084 3085 // The typed intrinsics add an immediate after the registers. 3086 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3087 3088 // The struct intrinsic variants add one additional operand over raw. 3089 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3090 Register VIndex; 3091 int OpOffset = 0; 3092 if (HasVIndex) { 3093 VIndex = MI.getOperand(3).getReg(); 3094 OpOffset = 1; 3095 } 3096 3097 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3098 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3099 3100 unsigned Format = 0; 3101 if (IsTyped) { 3102 Format = MI.getOperand(5 + OpOffset).getImm(); 3103 ++OpOffset; 3104 } 3105 3106 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3107 unsigned ImmOffset; 3108 unsigned TotalOffset; 3109 3110 LLT Ty = MRI.getType(Dst); 3111 LLT EltTy = Ty.getScalarType(); 3112 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3113 const bool Unpacked = ST.hasUnpackedD16VMem(); 3114 3115 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3116 if (TotalOffset != 0) 3117 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3118 3119 unsigned Opc; 3120 3121 if (IsTyped) { 3122 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3123 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3124 } else if (IsFormat) { 3125 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3126 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3127 } else { 3128 switch (MemSize) { 3129 case 1: 3130 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3131 break; 3132 case 2: 3133 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3134 break; 3135 default: 3136 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3137 break; 3138 } 3139 } 3140 3141 Register LoadDstReg; 3142 3143 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3144 LLT UnpackedTy = Ty.changeElementSize(32); 3145 3146 if (IsExtLoad) 3147 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3148 else if (Unpacked && IsD16 && Ty.isVector()) 3149 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3150 else 3151 LoadDstReg = Dst; 3152 3153 if (!VIndex) 3154 VIndex = B.buildConstant(S32, 0).getReg(0); 3155 3156 auto MIB = B.buildInstr(Opc) 3157 .addDef(LoadDstReg) // vdata 3158 .addUse(RSrc) // rsrc 3159 .addUse(VIndex) // vindex 3160 .addUse(VOffset) // voffset 3161 .addUse(SOffset) // soffset 3162 .addImm(ImmOffset); // offset(imm) 3163 3164 if (IsTyped) 3165 MIB.addImm(Format); 3166 3167 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3168 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3169 .addMemOperand(MMO); 3170 3171 if (LoadDstReg != Dst) { 3172 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3173 3174 // Widen result for extending loads was widened. 
3175 if (IsExtLoad) 3176 B.buildTrunc(Dst, LoadDstReg); 3177 else { 3178 // Repack to original 16-bit vector result 3179 // FIXME: G_TRUNC should work, but legalization currently fails 3180 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3181 SmallVector<Register, 4> Repack; 3182 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3183 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3184 B.buildMerge(Dst, Repack); 3185 } 3186 } 3187 3188 MI.eraseFromParent(); 3189 return true; 3190 } 3191 3192 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3193 MachineIRBuilder &B, 3194 bool IsInc) const { 3195 B.setInstr(MI); 3196 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3197 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3198 B.buildInstr(Opc) 3199 .addDef(MI.getOperand(0).getReg()) 3200 .addUse(MI.getOperand(2).getReg()) 3201 .addUse(MI.getOperand(3).getReg()) 3202 .cloneMemRefs(MI); 3203 MI.eraseFromParent(); 3204 return true; 3205 } 3206 3207 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3208 switch (IntrID) { 3209 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3210 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3211 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3212 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3213 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3214 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3215 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3216 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3217 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3218 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3219 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3220 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3221 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3222 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3223 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3224 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3225 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3226 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3227 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3228 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3229 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3230 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3231 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3232 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3233 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3234 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3235 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3236 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3237 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3238 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3239 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3240 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3241 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3242 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3243 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3244 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3245 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3246 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3247 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3248 default: 3249 llvm_unreachable("unhandled atomic opcode"); 3250 } 3251 } 3252 3253 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3254 MachineIRBuilder &B, 3255 Intrinsic::ID IID) const { 3256 B.setInstr(MI); 3257 3258 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3259 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3260 3261 Register Dst = MI.getOperand(0).getReg(); 3262 Register VData = 
MI.getOperand(2).getReg(); 3263 3264 Register CmpVal; 3265 int OpOffset = 0; 3266 3267 if (IsCmpSwap) { 3268 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3269 ++OpOffset; 3270 } 3271 3272 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3273 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3274 3275 // The struct intrinsic variants add one additional operand over raw. 3276 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3277 Register VIndex; 3278 if (HasVIndex) { 3279 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3280 ++OpOffset; 3281 } 3282 3283 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3284 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3285 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3286 3287 MachineMemOperand *MMO = *MI.memoperands_begin(); 3288 3289 unsigned ImmOffset; 3290 unsigned TotalOffset; 3291 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3292 if (TotalOffset != 0) 3293 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3294 3295 if (!VIndex) 3296 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3297 3298 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3299 .addDef(Dst) 3300 .addUse(VData); // vdata 3301 3302 if (IsCmpSwap) 3303 MIB.addReg(CmpVal); 3304 3305 MIB.addUse(RSrc) // rsrc 3306 .addUse(VIndex) // vindex 3307 .addUse(VOffset) // voffset 3308 .addUse(SOffset) // soffset 3309 .addImm(ImmOffset) // offset(imm) 3310 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3311 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3312 .addMemOperand(MMO); 3313 3314 MI.eraseFromParent(); 3315 return true; 3316 } 3317 3318 // Produce a vector of s16 elements from s32 pieces. 3319 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg, 3320 ArrayRef<Register> UnmergeParts) { 3321 const LLT S16 = LLT::scalar(16); 3322 3323 SmallVector<Register, 4> RemergeParts(UnmergeParts.size()); 3324 for (int I = 0, E = UnmergeParts.size(); I != E; ++I) 3325 RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0); 3326 3327 B.buildBuildVector(DstReg, RemergeParts); 3328 } 3329 3330 /// Convert a set of s32 registers to a result vector with s16 elements. 3331 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg, 3332 ArrayRef<Register> UnmergeParts) { 3333 MachineRegisterInfo &MRI = *B.getMRI(); 3334 const LLT V2S16 = LLT::vector(2, 16); 3335 LLT TargetTy = MRI.getType(DstReg); 3336 int NumElts = UnmergeParts.size(); 3337 3338 if (NumElts == 1) { 3339 assert(TargetTy == V2S16); 3340 B.buildBitcast(DstReg, UnmergeParts[0]); 3341 return; 3342 } 3343 3344 SmallVector<Register, 4> RemergeParts(NumElts); 3345 for (int I = 0; I != NumElts; ++I) 3346 RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0); 3347 3348 if (TargetTy.getSizeInBits() == 32u * NumElts) { 3349 B.buildConcatVectors(DstReg, RemergeParts); 3350 return; 3351 } 3352 3353 const LLT V3S16 = LLT::vector(3, 16); 3354 const LLT V6S16 = LLT::vector(6, 16); 3355 3356 // Widen to v6s16 and unpack v3 parts. 3357 assert(TargetTy == V3S16); 3358 3359 RemergeParts.push_back(B.buildUndef(V2S16).getReg(0)); 3360 auto Concat = B.buildConcatVectors(V6S16, RemergeParts); 3361 B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat); 3362 } 3363 3364 // FIXME: Just vector trunc should be sufficent, but legalization currently 3365 // broken. 
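// e.g. with an unpacked D16 layout a <4 x s16> result is returned in four
// 32-bit registers; the helper below unmerges the wide value, truncates each
// piece back to s16 and rebuilds the vector.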
3366 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg, 3367 Register WideDstReg) { 3368 const LLT S32 = LLT::scalar(32); 3369 const LLT S16 = LLT::scalar(16); 3370 3371 auto Unmerge = B.buildUnmerge(S32, WideDstReg); 3372 3373 int NumOps = Unmerge->getNumOperands() - 1; 3374 SmallVector<Register, 4> RemergeParts(NumOps); 3375 for (int I = 0; I != NumOps; ++I) 3376 RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0); 3377 3378 B.buildBuildVector(DstReg, RemergeParts); 3379 } 3380 3381 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3382 MachineInstr &MI, MachineIRBuilder &B, 3383 GISelChangeObserver &Observer, 3384 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3385 bool IsTFE = MI.getNumExplicitDefs() == 2; 3386 3387 // We are only processing the operands of d16 image operations on subtargets 3388 // that use the unpacked register layout, or need to repack the TFE result. 3389 3390 // TODO: Need to handle a16 images too 3391 // TODO: Do we need to guard against already legalized intrinsics? 3392 if (!IsTFE && !ST.hasUnpackedD16VMem()) 3393 return true; 3394 3395 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3396 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3397 3398 if (BaseOpcode->Atomic) // No d16 atomics, or TFE. 3399 return true; 3400 3401 B.setInstr(MI); 3402 3403 MachineRegisterInfo *MRI = B.getMRI(); 3404 const LLT S32 = LLT::scalar(32); 3405 const LLT S16 = LLT::scalar(16); 3406 3407 if (BaseOpcode->Store) { // No TFE for stores? 3408 Register VData = MI.getOperand(1).getReg(); 3409 LLT Ty = MRI->getType(VData); 3410 if (!Ty.isVector() || Ty.getElementType() != S16) 3411 return true; 3412 3413 B.setInstr(MI); 3414 3415 Observer.changingInstr(MI); 3416 MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData)); 3417 Observer.changedInstr(MI); 3418 return true; 3419 } 3420 3421 Register DstReg = MI.getOperand(0).getReg(); 3422 LLT Ty = MRI->getType(DstReg); 3423 const LLT EltTy = Ty.getScalarType(); 3424 const bool IsD16 = Ty.getScalarType() == S16; 3425 const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3426 3427 if (IsTFE) { 3428 // In the IR, TFE is supposed to be used with a 2 element struct return 3429 // type. The intruction really returns these two values in one contiguous 3430 // register, with one additional dword beyond the loaded data. Rewrite the 3431 // return type to use a single register result. 3432 Register Dst1Reg = MI.getOperand(1).getReg(); 3433 if (MRI->getType(Dst1Reg) != S32) 3434 return false; 3435 3436 // TODO: Make sure the TFE operand bit is set. 3437 3438 // The raw dword aligned data component of the load. The only legal cases 3439 // where this matters should be when using the packed D16 format, for 3440 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3441 LLT RoundedTy; 3442 LLT TFETy; 3443 3444 if (IsD16 && ST.hasUnpackedD16VMem()) { 3445 RoundedTy = LLT::scalarOrVector(NumElts, 32); 3446 TFETy = LLT::vector(NumElts + 1, 32); 3447 } else { 3448 unsigned EltSize = Ty.getScalarSizeInBits(); 3449 unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32; 3450 unsigned RoundedSize = 32 * RoundedElts; 3451 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3452 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3453 } 3454 3455 Register TFEReg = MRI->createGenericVirtualRegister(TFETy); 3456 Observer.changingInstr(MI); 3457 3458 MI.getOperand(0).setReg(TFEReg); 3459 MI.RemoveOperand(1); 3460 3461 Observer.changedInstr(MI); 3462 3463 // Insert after the instruction. 
    B.setInsertPt(*MI.getParent(), ++MI.getIterator());

    // Now figure out how to copy the new result register back into the old
    // result.

    SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
    int NumDataElts = TFETy.getNumElements() - 1;

    if (!Ty.isVector()) {
      // Simplest case is a trivial unmerge (plus a truncate for d16).
      UnmergeResults[0] = Ty == S32 ?
        DstReg : MRI->createGenericVirtualRegister(S32);

      B.buildUnmerge(UnmergeResults, TFEReg);
      if (Ty != S32)
        B.buildTrunc(DstReg, UnmergeResults[0]);
      return true;
    }

    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataElts; ++I)
      UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
    B.buildUnmerge(UnmergeResults, TFEReg);

    // Drop the final TFE element.
    ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);

    if (EltTy == S32)
      B.buildBuildVector(DstReg, DataPart);
    else if (ST.hasUnpackedD16VMem())
      truncToS16Vector(B, DstReg, DataPart);
    else
      bitcastToS16Vector(B, DstReg, DataPart);

    return true;
  }

  // Must be an image load.
  if (!Ty.isVector() || Ty.getElementType() != S16)
    return true;

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  LLT WidenedTy = Ty.changeElementType(S32);
  Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);

  Observer.changingInstr(MI);
  MI.getOperand(0).setReg(WideDstReg);
  Observer.changedInstr(MI);

  repackUnpackedD16Load(B, DstReg, WideDstReg);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const unsigned MemAlign = 4;
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant, MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
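  // For example, an s96 result is widened to s128 here, and a <3 x s32> result
  // to <4 x s32> (via getPow2ScalarType / getPow2VectorType).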
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);
    B.setInstr(MI);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrTarget);

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
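    // All of the raw and struct buffer atomic variants above share one
    // legalization: legalizeBufferAtomic (earlier in this file) rewrites them
    // to the buffer-atomic pseudo chosen by getBufferAtomicPseudo, splitting
    // the variable offset into a register part and an immediate field along
    // the way.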
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}