1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPU.h" 15 #include "AMDGPULegalizerInfo.h" 16 #include "AMDGPUTargetMachine.h" 17 #include "SIMachineFunctionInfo.h" 18 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 19 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 20 #include "llvm/CodeGen/TargetOpcodes.h" 21 #include "llvm/CodeGen/ValueTypes.h" 22 #include "llvm/IR/DerivedTypes.h" 23 #include "llvm/IR/Type.h" 24 #include "llvm/Support/Debug.h" 25 26 #define DEBUG_TYPE "amdgpu-legalinfo" 27 28 using namespace llvm; 29 using namespace LegalizeActions; 30 using namespace LegalizeMutations; 31 using namespace LegalityPredicates; 32 33 34 static LegalityPredicate isMultiple32(unsigned TypeIdx, 35 unsigned MaxSize = 512) { 36 return [=](const LegalityQuery &Query) { 37 const LLT Ty = Query.Types[TypeIdx]; 38 const LLT EltTy = Ty.getScalarType(); 39 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; 40 }; 41 } 42 43 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 44 return [=](const LegalityQuery &Query) { 45 const LLT Ty = Query.Types[TypeIdx]; 46 return Ty.isVector() && 47 Ty.getNumElements() % 2 != 0 && 48 Ty.getElementType().getSizeInBits() < 32; 49 }; 50 } 51 52 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 53 return [=](const LegalityQuery &Query) { 54 const LLT Ty = Query.Types[TypeIdx]; 55 const LLT EltTy = 
Ty.getElementType(); 56 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 57 }; 58 } 59 60 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 61 return [=](const LegalityQuery &Query) { 62 const LLT Ty = Query.Types[TypeIdx]; 63 const LLT EltTy = Ty.getElementType(); 64 unsigned Size = Ty.getSizeInBits(); 65 unsigned Pieces = (Size + 63) / 64; 66 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 67 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 68 }; 69 } 70 71 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 72 return [=](const LegalityQuery &Query) { 73 const LLT QueryTy = Query.Types[TypeIdx]; 74 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 75 }; 76 } 77 78 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 79 return [=](const LegalityQuery &Query) { 80 const LLT QueryTy = Query.Types[TypeIdx]; 81 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 82 }; 83 } 84 85 // Any combination of 32 or 64-bit elements up to 512 bits, and multiples of 86 // v2s16. 
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      // 16-bit elements are only register-sized in even-count pairs (v2s16
      // multiples); 32/64/128/256-bit elements are accepted directly.
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    // Scalars: any multiple of 32 bits up to 512 bits.
    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}

/// Build the legality rule tables for all generic opcodes on AMDGPU.
///
/// \param ST_ the subtarget; feature checks (16-bit insts, VOP3P, flat
///        address space, generation) select between rule variants below.
/// \param TM used only to query pointer sizes per address space.
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  // Helper to make an LLT pointer type for an address space, with the width
  // the target machine reports for that space.
  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  // Commonly-used scalar and vector LLTs, named by width.
  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  // All 32-bit element vectors up to 512 bits, and all 64-bit element
  // vectors up to 512 bits, for bulk legalFor() registration.
  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  // Pointer LLTs for each AMDGPU address space.
  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  // Code pointers are modeled as flat pointers here.
  const LLT CodePtr = FlatPtr;

  // Pointer types grouped by width: 64-bit and 32-bit address spaces.
  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  // FP type sets: base (no f16), with f16, and with packed v2f16.
  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  // Integer add/sub/mul: native s16 forms only exist with 16-bit insts.
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  // Overflow/carry ops: s32 result with s1 carry.
  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64, S16})
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal.  We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // Basic FP ops; s16/v2s16 variants are appended below when available.
  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});
  }

  // fmin/fmax family is always custom (see legalizeMinNumMaxNum); the type
  // set varies by subtarget.
  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);
  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  // int-to-fp: s64 sources need the custom expansion in legalizeITOFP.
  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  // Pre-CI parts lack 64-bit trunc/ceil/rint instructions, so s64 is custom
  // lowered there (see legalizeFceil / legalizeIntrinsicTrunc / legalizeFrint).
  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder(G_BSWAP)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Integer min/max: rule set varies with s16 and packed-s16 availability.
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  // Size-comparison predicate factories used by the int<->ptr rules below.
  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    // Otherwise resize the integer operand (index 1) to the pointer's width.
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    // Otherwise resize the integer result (index 0) to the pointer's width.
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  if (ST.hasFlatAddressSpace()) {
    getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
      .scalarize(0)
      .custom();
  }

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    // Extending loads wider than 32 bits: narrow the register type to s32.
    .narrowScalarIf([](const LegalityQuery &Query) {
        unsigned Size = Query.Types[0].getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    // 96-bit vector accesses are split to v2s32 on parts without dwordx3.
    .fewerElementsIf([=](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               !ST.hasDwordx3LoadStores();
      },
      [=](const LegalityQuery &Query) {
        return std::make_pair(0, V2S32);
      })
    .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          return ST.hasDwordx3LoadStores();

        case 256:
        case 512:
          // TODO: Possibly support loads of i256 and i512 .  This will require
          // adding i256 and i512 types to MVT in order for to be able to use
          // TableGen.
          // TODO: Add support for other vector types, this will require
          //       defining more value mappings for the new types.
          return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
                                    Ty0.getScalarType().getSizeInBits() == 64);

        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);


  // FIXME: Handle alignment requirements.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({
        {S32, GlobalPtr, 8, 8},
        {S32, GlobalPtr, 16, 8},
        {S32, LocalPtr, 8, 8},
        {S32, LocalPtr, 16, 8},
        {S32, PrivatePtr, 8, 8},
        {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  // RMW atomics and cmpxchg: s32/s64 on global and local memory, plus flat
  // when the subtarget has a flat address space.
  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  // Vector element access. The vector operand is index 1 for extract and
  // index 0 for insert; the element is the other, and the index is always 2.
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  // The extracted element type must match the vector's element type.
  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  // Sub-register extract/insert. Index 0/1 swap roles between the two ops.
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64)
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    // True if the type at TypeIdx is a vector whose element size is outside
    // [8, 64] or not a power of 2 — such vectors are broken up into scalars.
    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2.  It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)

      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  computeTables();
  verify(*ST.getInstrInfo());
}

/// Dispatch an instruction marked Custom by the rules above to its dedicated
/// legalizer. Returns false for opcodes with no custom handling here.
bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

/// Materialize an s32 register holding the high half (aperture base) of the
/// flat address corresponding to the LOCAL or PRIVATE address space \p AS.
/// On subtargets with aperture registers, reads it with S_GETREG_B32;
/// otherwise loads it from a fixed offset in the queue descriptor (currently
/// addressed via a placeholder constant rather than the real queue pointer).
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    // Pack the hwreg id/offset/width fields into the S_GETREG_B32 immediate.
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    // Shift the raw field left by its width (WidthM1 + 1 bits).
    // NOTE(review): presumably this places the aperture base in the high bits
    // of the 32-bit value — confirm against the SelectionDAG lowering.
    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  // FIXME: Placeholder until we can track the input registers.
  MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

/// Custom-lower G_ADDRSPACE_CAST. No-op casts become G_BITCAST; flat->segment
/// casts truncate to the low 32 bits with a null check; segment->flat casts
/// merge the 32-bit offset with the segment aperture, again guarded by a
/// null check so null maps to the flat null value.
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    // Select the segment null value when the flat source is null.
    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS);

  auto SegmentNull =
      MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

/// Custom-lower s64 G_FRINT (for pre-CI parts) using the classic 2^52
/// add/subtract trick: adding and removing copysign(2^52, src) rounds to
/// nearest even; values with |src| > 0x1.fffffffffffffp+51 are already
/// integral and are passed through unchanged.
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MIRBuilder.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  // C1 = 2^52 (first non-representable fraction boundary for f64);
  // C2 = largest value strictly below 2^52.
  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);

  // |src| > C2 means src is already an integer; keep it as-is.
  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

/// Custom-lower s64 G_FCEIL (for pre-CI parts) in terms of
/// G_INTRINSIC_TRUNC: ceil(x) = trunc(x) + (x > 0 && x != trunc(x) ? 1 : 0).
bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  // NOTE(review): despite the name, Lt0 holds "src > 0.0" (FCMP_OGT).
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
1015 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1016 return true; 1017 } 1018 1019 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1020 MachineIRBuilder &B) { 1021 const unsigned FractBits = 52; 1022 const unsigned ExpBits = 11; 1023 LLT S32 = LLT::scalar(32); 1024 1025 auto Const0 = B.buildConstant(S32, FractBits - 32); 1026 auto Const1 = B.buildConstant(S32, ExpBits); 1027 1028 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1029 .addUse(Const0.getReg(0)) 1030 .addUse(Const1.getReg(0)); 1031 1032 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1033 } 1034 1035 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1036 MachineInstr &MI, MachineRegisterInfo &MRI, 1037 MachineIRBuilder &B) const { 1038 B.setInstr(MI); 1039 1040 const LLT S1 = LLT::scalar(1); 1041 const LLT S32 = LLT::scalar(32); 1042 const LLT S64 = LLT::scalar(64); 1043 1044 Register Src = MI.getOperand(1).getReg(); 1045 assert(MRI.getType(Src) == S64); 1046 1047 // TODO: Should this use extract since the low half is unused? 1048 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1049 Register Hi = Unmerge.getReg(1); 1050 1051 // Extract the upper half, since this is where we will find the sign and 1052 // exponent. 1053 auto Exp = extractF64Exponent(Hi, B); 1054 1055 const unsigned FractBits = 52; 1056 1057 // Extract the sign bit. 1058 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1059 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1060 1061 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1062 1063 const auto Zero32 = B.buildConstant(S32, 0); 1064 1065 // Extend back to 64-bits. 
1066 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); 1067 1068 auto Shr = B.buildAShr(S64, FractMask, Exp); 1069 auto Not = B.buildNot(S64, Shr); 1070 auto Tmp0 = B.buildAnd(S64, Src, Not); 1071 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1072 1073 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1074 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1075 1076 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1077 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1078 return true; 1079 } 1080 1081 bool AMDGPULegalizerInfo::legalizeITOFP( 1082 MachineInstr &MI, MachineRegisterInfo &MRI, 1083 MachineIRBuilder &B, bool Signed) const { 1084 B.setInstr(MI); 1085 1086 Register Dst = MI.getOperand(0).getReg(); 1087 Register Src = MI.getOperand(1).getReg(); 1088 1089 const LLT S64 = LLT::scalar(64); 1090 const LLT S32 = LLT::scalar(32); 1091 1092 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1093 1094 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1095 1096 auto CvtHi = Signed ? 1097 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1098 B.buildUITOFP(S64, Unmerge.getReg(1)); 1099 1100 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1101 1102 auto ThirtyTwo = B.buildConstant(S32, 32); 1103 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1104 .addUse(CvtHi.getReg(0)) 1105 .addUse(ThirtyTwo.getReg(0)); 1106 1107 // TODO: Should this propagate fast-math-flags? 
1108 B.buildFAdd(Dst, LdExp, CvtLo); 1109 MI.eraseFromParent(); 1110 return true; 1111 } 1112 1113 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1114 MachineInstr &MI, MachineRegisterInfo &MRI, 1115 MachineIRBuilder &B) const { 1116 MachineFunction &MF = B.getMF(); 1117 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1118 1119 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1120 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1121 1122 // With ieee_mode disabled, the instructions have the correct behavior 1123 // already for G_FMINNUM/G_FMAXNUM 1124 if (!MFI->getMode().IEEE) 1125 return !IsIEEEOp; 1126 1127 if (IsIEEEOp) 1128 return true; 1129 1130 MachineIRBuilder HelperBuilder(MI); 1131 GISelObserverWrapper DummyObserver; 1132 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1133 HelperBuilder.setMBB(*MI.getParent()); 1134 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1135 } 1136 1137 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1138 MachineInstr &MI, MachineRegisterInfo &MRI, 1139 MachineIRBuilder &B) const { 1140 // TODO: Should move some of this into LegalizerHelper. 1141 1142 // TODO: Promote dynamic indexing of s16 to s32 1143 // TODO: Dynamic s64 indexing is only legal for SGPR. 1144 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); 1145 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1146 return true; 1147 1148 Register Dst = MI.getOperand(0).getReg(); 1149 Register Vec = MI.getOperand(1).getReg(); 1150 1151 LLT VecTy = MRI.getType(Vec); 1152 LLT EltTy = VecTy.getElementType(); 1153 assert(EltTy == MRI.getType(Dst)); 1154 1155 B.setInstr(MI); 1156 1157 if (IdxVal.getValue() < VecTy.getNumElements()) 1158 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); 1159 else 1160 B.buildUndef(Dst); 1161 1162 MI.eraseFromParent(); 1163 return true; 1164 } 1165 1166 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1167 MachineInstr &MI, MachineRegisterInfo &MRI, 1168 MachineIRBuilder &B) const { 1169 // TODO: Should move some of this into LegalizerHelper. 1170 1171 // TODO: Promote dynamic indexing of s16 to s32 1172 // TODO: Dynamic s64 indexing is only legal for SGPR. 1173 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); 1174 if (!IdxVal) // Dynamic case will be selected to register indexing. 1175 return true; 1176 1177 Register Dst = MI.getOperand(0).getReg(); 1178 Register Vec = MI.getOperand(1).getReg(); 1179 Register Ins = MI.getOperand(2).getReg(); 1180 1181 LLT VecTy = MRI.getType(Vec); 1182 LLT EltTy = VecTy.getElementType(); 1183 assert(EltTy == MRI.getType(Ins)); 1184 1185 B.setInstr(MI); 1186 1187 if (IdxVal.getValue() < VecTy.getNumElements()) 1188 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); 1189 else 1190 B.buildUndef(Dst); 1191 1192 MI.eraseFromParent(); 1193 return true; 1194 } 1195 1196 // Return the use branch instruction, otherwise null if the usage is invalid. 1197 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 1198 MachineRegisterInfo &MRI) { 1199 Register CondDef = MI.getOperand(0).getReg(); 1200 if (!MRI.hasOneNonDBGUse(CondDef)) 1201 return nullptr; 1202 1203 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 1204 return UseMI.getParent() == MI.getParent() && 1205 UseMI.getOpcode() == AMDGPU::G_BRCOND ? 
&UseMI : nullptr; 1206 } 1207 1208 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, 1209 Register Reg, LLT Ty) const { 1210 Register LiveIn = MRI.getLiveInVirtReg(Reg); 1211 if (LiveIn) 1212 return LiveIn; 1213 1214 Register NewReg = MRI.createGenericVirtualRegister(Ty); 1215 MRI.addLiveIn(Reg, NewReg); 1216 return NewReg; 1217 } 1218 1219 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 1220 const ArgDescriptor *Arg) const { 1221 if (!Arg->isRegister()) 1222 return false; // TODO: Handle these 1223 1224 assert(Arg->getRegister() != 0); 1225 assert(Arg->getRegister().isPhysical()); 1226 1227 MachineRegisterInfo &MRI = *B.getMRI(); 1228 1229 LLT Ty = MRI.getType(DstReg); 1230 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); 1231 1232 if (Arg->isMasked()) { 1233 // TODO: Should we try to emit this once in the entry block? 1234 const LLT S32 = LLT::scalar(32); 1235 const unsigned Mask = Arg->getMask(); 1236 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 1237 1238 auto ShiftAmt = B.buildConstant(S32, Shift); 1239 auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt); 1240 B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift)); 1241 } else 1242 B.buildCopy(DstReg, LiveIn); 1243 1244 // Insert the argument copy if it doens't already exist. 1245 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
1246 if (!MRI.getVRegDef(LiveIn)) { 1247 MachineBasicBlock &EntryMBB = B.getMF().front(); 1248 EntryMBB.addLiveIn(Arg->getRegister()); 1249 B.setInsertPt(EntryMBB, EntryMBB.begin()); 1250 B.buildCopy(LiveIn, Arg->getRegister()); 1251 } 1252 1253 return true; 1254 } 1255 1256 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 1257 MachineInstr &MI, 1258 MachineRegisterInfo &MRI, 1259 MachineIRBuilder &B, 1260 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 1261 B.setInstr(MI); 1262 1263 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 1264 1265 const ArgDescriptor *Arg; 1266 const TargetRegisterClass *RC; 1267 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 1268 if (!Arg) { 1269 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 1270 return false; 1271 } 1272 1273 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 1274 MI.eraseFromParent(); 1275 return true; 1276 } 1277 1278 return false; 1279 } 1280 1281 bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI, 1282 MachineRegisterInfo &MRI, 1283 MachineIRBuilder &B) const { 1284 B.setInstr(MI); 1285 Register Res = MI.getOperand(0).getReg(); 1286 Register LHS = MI.getOperand(2).getReg(); 1287 Register RHS = MI.getOperand(3).getReg(); 1288 uint16_t Flags = MI.getFlags(); 1289 1290 LLT S32 = LLT::scalar(32); 1291 LLT S1 = LLT::scalar(1); 1292 1293 auto Abs = B.buildFAbs(S32, RHS, Flags); 1294 const APFloat C0Val(1.0f); 1295 1296 auto C0 = B.buildConstant(S32, 0x6f800000); 1297 auto C1 = B.buildConstant(S32, 0x2f800000); 1298 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 1299 1300 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 1301 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 1302 1303 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 1304 1305 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 1306 .addUse(Mul0.getReg(0)) 1307 .setMIFlags(Flags); 1308 1309 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 1310 1311 
B.buildFMul(Res, Sel, Mul1, Flags); 1312 1313 MI.eraseFromParent(); 1314 return true; 1315 } 1316 1317 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 1318 MachineRegisterInfo &MRI, 1319 MachineIRBuilder &B) const { 1320 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 1321 if (!MFI->isEntryFunction()) { 1322 return legalizePreloadedArgIntrin(MI, MRI, B, 1323 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 1324 } 1325 1326 B.setInstr(MI); 1327 1328 uint64_t Offset = 1329 ST.getTargetLowering()->getImplicitParameterOffset( 1330 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 1331 Register DstReg = MI.getOperand(0).getReg(); 1332 LLT DstTy = MRI.getType(DstReg); 1333 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 1334 1335 const ArgDescriptor *Arg; 1336 const TargetRegisterClass *RC; 1337 std::tie(Arg, RC) 1338 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 1339 if (!Arg) 1340 return false; 1341 1342 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 1343 if (!loadInputValue(KernargPtrReg, B, Arg)) 1344 return false; 1345 1346 B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 1347 MI.eraseFromParent(); 1348 return true; 1349 } 1350 1351 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 1352 MachineRegisterInfo &MRI, 1353 MachineIRBuilder &B) const { 1354 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
1355 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { 1356 case Intrinsic::amdgcn_if: { 1357 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { 1358 const SIRegisterInfo *TRI 1359 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 1360 1361 B.setInstr(*BrCond); 1362 Register Def = MI.getOperand(1).getReg(); 1363 Register Use = MI.getOperand(3).getReg(); 1364 B.buildInstr(AMDGPU::SI_IF) 1365 .addDef(Def) 1366 .addUse(Use) 1367 .addMBB(BrCond->getOperand(1).getMBB()); 1368 1369 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 1370 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 1371 MI.eraseFromParent(); 1372 BrCond->eraseFromParent(); 1373 return true; 1374 } 1375 1376 return false; 1377 } 1378 case Intrinsic::amdgcn_loop: { 1379 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { 1380 const SIRegisterInfo *TRI 1381 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 1382 1383 B.setInstr(*BrCond); 1384 Register Reg = MI.getOperand(2).getReg(); 1385 B.buildInstr(AMDGPU::SI_LOOP) 1386 .addUse(Reg) 1387 .addMBB(BrCond->getOperand(1).getMBB()); 1388 MI.eraseFromParent(); 1389 BrCond->eraseFromParent(); 1390 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 1391 return true; 1392 } 1393 1394 return false; 1395 } 1396 case Intrinsic::amdgcn_kernarg_segment_ptr: 1397 return legalizePreloadedArgIntrin( 1398 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 1399 case Intrinsic::amdgcn_implicitarg_ptr: 1400 return legalizeImplicitArgPtr(MI, MRI, B); 1401 case Intrinsic::amdgcn_workitem_id_x: 1402 return legalizePreloadedArgIntrin(MI, MRI, B, 1403 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 1404 case Intrinsic::amdgcn_workitem_id_y: 1405 return legalizePreloadedArgIntrin(MI, MRI, B, 1406 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 1407 case Intrinsic::amdgcn_workitem_id_z: 1408 return legalizePreloadedArgIntrin(MI, MRI, B, 1409 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 1410 case Intrinsic::amdgcn_workgroup_id_x: 1411 
return legalizePreloadedArgIntrin(MI, MRI, B, 1412 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 1413 case Intrinsic::amdgcn_workgroup_id_y: 1414 return legalizePreloadedArgIntrin(MI, MRI, B, 1415 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 1416 case Intrinsic::amdgcn_workgroup_id_z: 1417 return legalizePreloadedArgIntrin(MI, MRI, B, 1418 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 1419 case Intrinsic::amdgcn_dispatch_ptr: 1420 return legalizePreloadedArgIntrin(MI, MRI, B, 1421 AMDGPUFunctionArgInfo::DISPATCH_PTR); 1422 case Intrinsic::amdgcn_queue_ptr: 1423 return legalizePreloadedArgIntrin(MI, MRI, B, 1424 AMDGPUFunctionArgInfo::QUEUE_PTR); 1425 case Intrinsic::amdgcn_implicit_buffer_ptr: 1426 return legalizePreloadedArgIntrin( 1427 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 1428 case Intrinsic::amdgcn_dispatch_id: 1429 return legalizePreloadedArgIntrin(MI, MRI, B, 1430 AMDGPUFunctionArgInfo::DISPATCH_ID); 1431 case Intrinsic::amdgcn_fdiv_fast: 1432 return legalizeFDIVFast(MI, MRI, B); 1433 default: 1434 return true; 1435 } 1436 1437 return true; 1438 } 1439