//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
// v2s16.
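// For example, s32, s64, v2s16, v4s16 and v2s32 all qualify, while v3s16 (an
// odd number of 16-bit elements) and s24 (not a multiple of 32) do not.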
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64, S16})
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
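  // Note that pointer constants are made legal for any address space via the
  // isPointer predicate below, so a null pointer constant is already legal.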
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);
  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
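  // copysign(x, y) expands to (x & ~SignBitMask) | (y & SignBitMask); the TODO
  // above is to confirm that expansion is then selected to V_BFI_B32.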
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
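  // Hence the result (type index 0) is clamped to s32 below, while the source
  // (type index 1) may remain either s32 or s64.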
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder(G_BSWAP)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  if (ST.hasFlatAddressSpace()) {
    getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
      .scalarize(0)
      .custom();
  }

  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    .narrowScalarIf([](const LegalityQuery &Query) {
        unsigned Size = Query.Types[0].getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    .fewerElementsIf([=](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               !ST.hasDwordx3LoadStores();
      },
      [=](const LegalityQuery &Query) {
        return
          std::make_pair(0, V2S32);
      })
    .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          return ST.hasDwordx3LoadStores();

        case 256:
        case 512:
          // TODO: Possibly support loads of i256 and i512. This will require
          // adding i256 and i512 types to MVT in order to be able to use
          // TableGen.
          // TODO: Add support for other vector types, this will require
          // defining more value mappings for the new types.
          return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
                                    Ty0.getScalarType().getSizeInBits() == 64);

        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);

  // FIXME: Handle alignment requirements.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                     .legalForTypesWithMemDesc({
                         {S32, GlobalPtr, 8, 8},
                         {S32, GlobalPtr, 16, 8},
                         {S32, LocalPtr, 8, 8},
                         {S32, LocalPtr, 16, 8},
                         {S32, PrivatePtr, 8, 8},
                         {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
                               GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
                               LLT::vector(2, LocalPtr),
                               LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
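  // For now the amount (type index 1) is clamped to what the selected
  // instructions consume: s16-s32 with 16-bit instructions, exactly s32
  // otherwise.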
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &VecTy = Query.Types[VecTypeIdx];
          const LLT &IdxTy = Query.Types[IdxTypeIdx];
          return VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64)
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ?
      1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)

      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
      AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
      Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
      WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  // FIXME: Placeholder until we can track the input registers.
  MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ?
    0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS);

  auto SegmentNull =
    MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
    MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
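  // Build the flat pointer as {low 32 bits of the source, aperture base},
  // then select the flat null value if the segment pointer was null.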
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MIRBuilder.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);

  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
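  // For an f64 value the sign lives in bit 31 of the high 32-bit half.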
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ?
    &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister())
    return false; // TODO: Handle these

  assert(Arg->getRegister() != 0);
  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool
AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec-manipulating branch pseudos.
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  default:
    return true;
  }

  return true;
}