//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64,
               V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64, S16})
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);
  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
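  // The result (type index 0) is therefore clamped to exactly 32 bits below;
  // only the source type varies between the s32 and s64 forms.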
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder(G_BSWAP)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  if (ST.hasFlatAddressSpace()) {
    getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
      .scalarize(0)
      .custom();
  }

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    .narrowScalarIf([](const LegalityQuery &Query) {
        unsigned Size = Query.Types[0].getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    .fewerElementsIf([=](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               !ST.hasDwordx3LoadStores();
      },
      [=](const LegalityQuery &Query) {
        return std::make_pair(0, V2S32);
      })
    .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          return ST.hasDwordx3LoadStores();

        case 256:
        case 512:
          // TODO: Possibly support loads of i256 and i512. This will require
          // adding i256 and i512 types to MVT in order to be able to use
          // TableGen.
          // TODO: Add support for other vector types, this will require
          // defining more value mappings for the new types.
          return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
                                    Ty0.getScalarType().getSizeInBits() == 64);

        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);

  // FIXME: Handle alignment requirements.
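  // Only 8- and 16-bit extending loads into a 32-bit result are marked legal
  // here; other power-of-2 combinations fall through to lower(), which
  // expands them into a plain load followed by the appropriate extend.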
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({
        {S32, GlobalPtr, 8, 8},
        {S32, GlobalPtr, 16, 8},
        {S32, LocalPtr, 8, 8},
        {S32, LocalPtr, 16, 8},
        {S32, PrivatePtr, 8, 8},
        {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
                               GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
                               LLT::vector(2, LocalPtr),
                               LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ?
      0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64)
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)

      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128, whichever
          // is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
      AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
      Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
      WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  // FIXME: Placeholder until we can track the input registers.
  MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
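    // A non-null flat pointer casts to its low 32 bits in the destination
    // segment; the select below substitutes the segment null value when the
    // source is the flat null value.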
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS);

  auto SegmentNull =
    MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
    MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MIRBuilder.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);

  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
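  // Add is either 1.0 or 0.0 from the select above, so the conditional
  // increment becomes a single unconditional fadd.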
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
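  // Recombine the halves: the converted high half was scaled by 2^32 via
  // ldexp, so adding the unsigned-converted low half yields the full value.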
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ?
    &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister())
    return false; // TODO: Handle these

  assert(Arg->getRegister() != 0);
  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool
AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  default:
    return true;
  }

  return true;
}