//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}
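
// For example, fewerEltsToSize64Vector maps <3 x s32> (96 bits, 2 pieces) to
// <2 x s32> and <4 x s64> (256 bits, 4 pieces) all the way down to s64, while
// oneMoreElement pads an odd element count, e.g. <3 x s16> to <4 x s16>.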

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };
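
  // These type lists mirror the subtarget feature tiers used below:
  // FPTypesBase for targets without 16-bit instructions, FPTypes16 once f16
  // operations are available, and FPTypesPK16 when packed v2f16 (VOP3P)
  // operations exist as well.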
  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE).customFor({LocalPtr});


  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };
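
  // For example, a G_INTTOPTR producing a 64-bit flat pointer from an s32
  // source widens the integer operand to s64 first, while an s128 source is
  // narrowed to s64; G_PTRTOINT is handled symmetrically on its result type.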
  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    .narrowScalarIf([](const LegalityQuery &Query) {
          unsigned Size = Query.Types[0].getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          return (Size > 32 && MemSize < Size);
        },
        [](const LegalityQuery &Query) {
          return std::make_pair(0, LLT::scalar(32));
        })
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf([=](const LegalityQuery &Query) {
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          return (MemSize == 96) &&
                 Query.Types[0].isVector() &&
                 !ST.hasDwordx3LoadStores();
        },
        [=](const LegalityQuery &Query) {
          return std::make_pair(0, V2S32);
        })
    .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          return ST.hasDwordx3LoadStores();

        case 256:
        case 512:
          // TODO: Possibly support loads of i256 and i512. This will require
          // adding i256 and i512 types to MVT in order to be able to use
          // TableGen.
          // TODO: Add support for other vector types, this will require
          // defining more value mappings for the new types.
          return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
                                    Ty0.getScalarType().getSizeInBits() == 64);

        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);
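
  // In practice this means 8- and 16-bit memory accesses must produce a
  // 32-bit register value, 96-bit vector accesses are only kept whole when
  // the subtarget has dwordx3 load/store, and 256/512-bit accesses are only
  // legal as vectors of 32- or 64-bit elements.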

  // FIXME: Handle alignment requirements.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({
        {S32, GlobalPtr, 8, 8},
        {S32, GlobalPtr, 16, 8},
        {S32, LocalPtr, 8, 8},
        {S32, LocalPtr, 16, 8},
        {S32, PrivatePtr, 8, 8},
        {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);
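
  // G_EXTRACT_VECTOR_ELT and G_INSERT_VECTOR_ELT share the same constraints;
  // only the operand position of the vector (and element) differs, which is
  // what the index variables in the loop below select.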
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64);

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .lowerFor({{S16, V2S16}})
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, MIRBuilder);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, MIRBuilder, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    MIRBuilder.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = MIRBuilder.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    MIRBuilder.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }
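
  // Casting a flat pointer to a LOCAL or PRIVATE pointer just drops the high
  // half, but a null flat pointer must map to the segment's null value, hence
  // the compare and select around the 32-bit extract below.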
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MIRBuilder.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);

  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}
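
// Pull the biased exponent field (11 bits starting at bit 20) out of the high
// 32 bits of an f64 with the amdgcn.ubfe intrinsic and subtract the exponent
// bias of 1023.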
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}
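
// In IEEE mode the hardware min/max instructions match the
// G_FMINNUM_IEEE/G_FMAXNUM_IEEE semantics, so only the plain
// G_FMINNUM/G_FMAXNUM forms need to be expanded through the generic helper.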
bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    B.setInstr(MI);

    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }
  } else
    return false;

  const Function &Fn = MF.getFunction();
  DiagnosticInfoUnsupported BadInit(
    Fn, "unsupported initializer for address space", MI.getDebugLoc());
  Fn.getContext().diagnose(BadInit);
  return true;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);
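
  // 0x6f800000 is 0x1.0p+96f and 0x2f800000 is 0x1.0p-32f: if |RHS| is very
  // large, pre-scale the denominator by 2^-32 before taking the reciprocal so
  // the intermediate values stay in range, then multiply the final result by
  // the same scale factor to compensate.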
  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFast(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  default:
    return true;
  }

  return true;
}