//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
// v2s16.
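// (The 16-bit element case requires an even element count so the total vector
// size stays a multiple of 32 bits.)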
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

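  // G_BRCOND only needs its s1 condition type registered; the branch
  // destination is a basic block operand and has no register type.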
  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64, S16})
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);
  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder(G_BSWAP)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  if (ST.hasFlatAddressSpace()) {
    getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
      .scalarize(0)
      .custom();
  }

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
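  // The rules below narrow register-extending loads to a 32-bit result, and
  // split 96-bit vector accesses into 64-bit pieces when dwordx3 load/store
  // instructions are unavailable.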
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    .narrowScalarIf([](const LegalityQuery &Query) {
        unsigned Size = Query.Types[0].getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf([=](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               !ST.hasDwordx3LoadStores();
      },
      [=](const LegalityQuery &Query) {
        return std::make_pair(0, V2S32);
      })
    .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          return ST.hasDwordx3LoadStores();

        case 256:
        case 512:
          // TODO: Possibly support loads of i256 and i512. This will require
          // adding i256 and i512 types to MVT in order to be able to use
          // TableGen.
          // TODO: Add support for other vector types, this will require
          // defining more value mappings for the new types.
          return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
                                    Ty0.getScalarType().getSizeInBits() == 64);

        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);


  // FIXME: Handle alignment requirements.
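  // 8- and 16-bit sign/zero-extending loads are only selectable directly from
  // global, local, and private (and flat, when available); everything else is
  // lowered to a plain load followed by an extend.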
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({
        {S32, GlobalPtr, 8, 8},
        {S32, GlobalPtr, 16, 8},
        {S32, LocalPtr, 8, 8},
        {S32, LocalPtr, 16, 8},
        {S32, PrivatePtr, 8, 8},
        {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64)
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .legalIf(all(typeIs(0, S16), typeIs(1, LLT::vector(3, 16)))) // FIXME: Testing hack
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .lowerFor({{S16, V2S16}})
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128,
          // whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  // FIXME: Placeholder until we can track the input registers.
  MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS);

  auto SegmentNull =
      MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
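  // The 32-bit segment offset becomes the low half of the flat pointer; the
  // aperture base supplies the high half.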
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MIRBuilder.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);

  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;
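  // Clear the fraction bits that fall below the exponent. An exponent below
  // zero truncates to +/-0 (just the sign bit), and an exponent above 51 means
  // the value is already an integer.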
  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64 bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;
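  // A constant index known to be in bounds is folded to a static extract; an
  // out-of-bounds constant index produces G_IMPLICIT_DEF below.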
  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister())
    return false; // TODO: Handle these

  assert(Arg->getRegister() != 0);
  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
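  // Most of the remaining argument intrinsics simply copy a preloaded input
  // register into the result.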
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFast(MI, MRI, B);
  default:
    return true;
  }

  return true;
}