//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

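// Illustrative examples of the predicates/mutations above (exposition only;
// these particular types are not referenced by the rules themselves):
//   - getPow2ScalarType(LLT::scalar(48)) rounds up to s64.
//   - isSmallOddVector() matches v3s16 (odd element count, 16-bit elements,
//     48 bits total), and oneMoreElement() would widen it to v4s16, whose
//     64 bits are a multiple of 32 and therefore fill whole registers.
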
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 101 return [=](const LegalityQuery &Query) { 102 const LLT Ty = Query.Types[TypeIdx]; 103 const LLT EltTy = Ty.getElementType(); 104 unsigned Size = Ty.getSizeInBits(); 105 unsigned Pieces = (Size + 63) / 64; 106 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 107 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 108 }; 109 } 110 111 // Increase the number of vector elements to reach the next multiple of 32-bit 112 // type. 113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 114 return [=](const LegalityQuery &Query) { 115 const LLT Ty = Query.Types[TypeIdx]; 116 117 const LLT EltTy = Ty.getElementType(); 118 const int Size = Ty.getSizeInBits(); 119 const int EltSize = EltTy.getSizeInBits(); 120 const int NextMul32 = (Size + 31) / 32; 121 122 assert(EltSize < 32); 123 124 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 125 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 126 }; 127 } 128 129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 130 return [=](const LegalityQuery &Query) { 131 const LLT QueryTy = Query.Types[TypeIdx]; 132 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 133 }; 134 } 135 136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 137 return [=](const LegalityQuery &Query) { 138 const LLT QueryTy = Query.Types[TypeIdx]; 139 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 140 }; 141 } 142 143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 144 return [=](const LegalityQuery &Query) { 145 const LLT QueryTy = Query.Types[TypeIdx]; 146 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 147 }; 148 } 149 150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 151 // v2s16. 
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 153 return [=](const LegalityQuery &Query) { 154 const LLT Ty = Query.Types[TypeIdx]; 155 if (Ty.isVector()) { 156 const int EltSize = Ty.getElementType().getSizeInBits(); 157 return EltSize == 32 || EltSize == 64 || 158 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 159 EltSize == 128 || EltSize == 256; 160 } 161 162 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 163 }; 164 } 165 166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 167 return [=](const LegalityQuery &Query) { 168 const LLT QueryTy = Query.Types[TypeIdx]; 169 return QueryTy.isVector() && QueryTy.getElementType() == Type; 170 }; 171 } 172 173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 174 return [=](const LegalityQuery &Query) { 175 const LLT Ty = Query.Types[TypeIdx]; 176 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 177 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 178 }; 179 } 180 181 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) { 182 return [=](const LegalityQuery &Query) { 183 return Query.Types[TypeIdx0].getSizeInBits() < 184 Query.Types[TypeIdx1].getSizeInBits(); 185 }; 186 } 187 188 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) { 189 return [=](const LegalityQuery &Query) { 190 return Query.Types[TypeIdx0].getSizeInBits() > 191 Query.Types[TypeIdx1].getSizeInBits(); 192 }; 193 } 194 195 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 196 const GCNTargetMachine &TM) 197 : ST(ST_) { 198 using namespace TargetOpcode; 199 200 auto GetAddrSpacePtr = [&TM](unsigned AS) { 201 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 202 }; 203 204 const LLT S1 = LLT::scalar(1); 205 const LLT S16 = LLT::scalar(16); 206 const LLT S32 = LLT::scalar(32); 207 const LLT S64 = LLT::scalar(64); 208 const LLT S128 = LLT::scalar(128); 209 const LLT S256 = LLT::scalar(256); 210 const LLT S1024 = LLT::scalar(1024); 211 212 const LLT V2S16 = LLT::vector(2, 16); 213 const LLT V4S16 = LLT::vector(4, 16); 214 215 const LLT V2S32 = LLT::vector(2, 32); 216 const LLT V3S32 = LLT::vector(3, 32); 217 const LLT V4S32 = LLT::vector(4, 32); 218 const LLT V5S32 = LLT::vector(5, 32); 219 const LLT V6S32 = LLT::vector(6, 32); 220 const LLT V7S32 = LLT::vector(7, 32); 221 const LLT V8S32 = LLT::vector(8, 32); 222 const LLT V9S32 = LLT::vector(9, 32); 223 const LLT V10S32 = LLT::vector(10, 32); 224 const LLT V11S32 = LLT::vector(11, 32); 225 const LLT V12S32 = LLT::vector(12, 32); 226 const LLT V13S32 = LLT::vector(13, 32); 227 const LLT V14S32 = LLT::vector(14, 32); 228 const LLT V15S32 = LLT::vector(15, 32); 229 const LLT V16S32 = LLT::vector(16, 32); 230 const LLT V32S32 = LLT::vector(32, 32); 231 232 const LLT V2S64 = LLT::vector(2, 64); 233 const LLT V3S64 = LLT::vector(3, 64); 234 const LLT V4S64 = LLT::vector(4, 64); 235 const LLT V5S64 = LLT::vector(5, 64); 236 const LLT V6S64 = LLT::vector(6, 64); 237 const LLT V7S64 = LLT::vector(7, 64); 238 const LLT V8S64 = LLT::vector(8, 64); 239 const LLT V16S64 = LLT::vector(16, 64); 240 241 std::initializer_list<LLT> AllS32Vectors = 242 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 243 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 244 std::initializer_list<LLT> AllS64Vectors = 245 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 246 247 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 248 const LLT ConstantPtr = 
GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 249 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 250 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 251 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 252 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 253 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 254 255 const LLT CodePtr = FlatPtr; 256 257 const std::initializer_list<LLT> AddrSpaces64 = { 258 GlobalPtr, ConstantPtr, FlatPtr 259 }; 260 261 const std::initializer_list<LLT> AddrSpaces32 = { 262 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 263 }; 264 265 const std::initializer_list<LLT> FPTypesBase = { 266 S32, S64 267 }; 268 269 const std::initializer_list<LLT> FPTypes16 = { 270 S32, S64, S16 271 }; 272 273 const std::initializer_list<LLT> FPTypesPK16 = { 274 S32, S64, S16, V2S16 275 }; 276 277 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 278 279 setAction({G_BRCOND, S1}, Legal); // VCC branches 280 setAction({G_BRCOND, S32}, Legal); // SCC branches 281 282 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 283 // elements for v3s16 284 getActionDefinitionsBuilder(G_PHI) 285 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 286 .legalFor(AllS32Vectors) 287 .legalFor(AllS64Vectors) 288 .legalFor(AddrSpaces64) 289 .legalFor(AddrSpaces32) 290 .clampScalar(0, S32, S256) 291 .widenScalarToNextPow2(0, 32) 292 .clampMaxNumElements(0, S32, 16) 293 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 294 .legalIf(isPointer(0)); 295 296 if (ST.has16BitInsts()) { 297 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 298 .legalFor({S32, S16}) 299 .clampScalar(0, S16, S32) 300 .scalarize(0) 301 .widenScalarToNextPow2(0, 32); 302 } else { 303 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 304 .legalFor({S32}) 305 .clampScalar(0, S32, S32) 306 .scalarize(0); 307 } 308 309 // FIXME: Not really legal. Placeholder for custom lowering. 310 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 311 .legalFor({S32, S64}) 312 .clampScalar(0, S32, S64) 313 .widenScalarToNextPow2(0, 32) 314 .scalarize(0); 315 316 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 317 .legalFor({S32}) 318 .clampScalar(0, S32, S32) 319 .scalarize(0); 320 321 // Report legal for any types we can handle anywhere. For the cases only legal 322 // on the SALU, RegBankSelect will be able to re-legalize. 323 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 324 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 325 .clampScalar(0, S32, S64) 326 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 327 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 328 .widenScalarToNextPow2(0) 329 .scalarize(0); 330 331 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 332 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 333 .legalFor({{S32, S1}, {S32, S32}}) 334 .clampScalar(0, S32, S32) 335 .scalarize(0); // TODO: Implement. 336 337 getActionDefinitionsBuilder(G_BITCAST) 338 // Don't worry about the size constraint. 
339 .legalIf(all(isRegisterType(0), isRegisterType(1))) 340 .lower(); 341 342 343 getActionDefinitionsBuilder(G_CONSTANT) 344 .legalFor({S1, S32, S64, S16, GlobalPtr, 345 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 346 .clampScalar(0, S32, S64) 347 .widenScalarToNextPow2(0) 348 .legalIf(isPointer(0)); 349 350 getActionDefinitionsBuilder(G_FCONSTANT) 351 .legalFor({S32, S64, S16}) 352 .clampScalar(0, S16, S64); 353 354 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 355 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 356 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 357 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 358 .clampScalarOrElt(0, S32, S1024) 359 .legalIf(isMultiple32(0)) 360 .widenScalarToNextPow2(0, 32) 361 .clampMaxNumElements(0, S32, 16); 362 363 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 364 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 365 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); 366 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 367 368 auto &FPOpActions = getActionDefinitionsBuilder( 369 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 370 .legalFor({S32, S64}); 371 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 372 .customFor({S32, S64}); 373 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 374 .customFor({S32, S64}); 375 376 if (ST.has16BitInsts()) { 377 if (ST.hasVOP3PInsts()) 378 FPOpActions.legalFor({S16, V2S16}); 379 else 380 FPOpActions.legalFor({S16}); 381 382 TrigActions.customFor({S16}); 383 FDIVActions.customFor({S16}); 384 } 385 386 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 387 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 388 389 if (ST.hasVOP3PInsts()) { 390 MinNumMaxNum.customFor(FPTypesPK16) 391 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 392 .clampMaxNumElements(0, S16, 2) 393 .clampScalar(0, S16, S64) 394 .scalarize(0); 395 } else if (ST.has16BitInsts()) { 396 MinNumMaxNum.customFor(FPTypes16) 397 .clampScalar(0, S16, S64) 398 .scalarize(0); 399 } else { 400 MinNumMaxNum.customFor(FPTypesBase) 401 .clampScalar(0, S32, S64) 402 .scalarize(0); 403 } 404 405 if (ST.hasVOP3PInsts()) 406 FPOpActions.clampMaxNumElements(0, S16, 2); 407 408 FPOpActions 409 .scalarize(0) 410 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 411 412 TrigActions 413 .scalarize(0) 414 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 415 416 FDIVActions 417 .scalarize(0) 418 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 419 420 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 421 .legalFor(FPTypesPK16) 422 .clampMaxNumElements(0, S16, 2) 423 .scalarize(0) 424 .clampScalar(0, S16, S64); 425 426 if (ST.has16BitInsts()) { 427 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 428 .legalFor({S32, S64, S16}) 429 .scalarize(0) 430 .clampScalar(0, S16, S64); 431 } else { 432 getActionDefinitionsBuilder(G_FSQRT) 433 .legalFor({S32, S64}) 434 .scalarize(0) 435 .clampScalar(0, S32, S64); 436 437 if (ST.hasFractBug()) { 438 getActionDefinitionsBuilder(G_FFLOOR) 439 .customFor({S64}) 440 .legalFor({S32, S64}) 441 .scalarize(0) 442 .clampScalar(0, S32, S64); 443 } else { 444 getActionDefinitionsBuilder(G_FFLOOR) 445 .legalFor({S32, S64}) 446 .scalarize(0) 447 .clampScalar(0, S32, S64); 448 } 449 } 450 451 getActionDefinitionsBuilder(G_FPTRUNC) 452 .legalFor({{S32, S64}, {S16, S32}}) 453 .scalarize(0) 454 .lower(); 455 456 getActionDefinitionsBuilder(G_FPEXT) 457 .legalFor({{S64, S32}, {S32, S16}}) 458 .lowerFor({{S64, S16}}) // FIXME: Implement 459 .scalarize(0); 460 461 getActionDefinitionsBuilder(G_FSUB) 462 // Use actual fsub instruction 463 .legalFor({S32}) 464 // Must use fadd + fneg 465 .lowerFor({S64, S16, V2S16}) 466 .scalarize(0) 467 .clampScalar(0, S32, S64); 468 469 // Whether this is legal depends on the floating point mode for the function. 470 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 471 if (ST.hasMadF16()) 472 FMad.customFor({S32, S16}); 473 else 474 FMad.customFor({S32}); 475 FMad.scalarize(0) 476 .lower(); 477 478 getActionDefinitionsBuilder(G_TRUNC) 479 .alwaysLegal(); 480 481 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 482 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 483 {S32, S1}, {S64, S1}, {S16, S1}}) 484 .scalarize(0) 485 .clampScalar(0, S32, S64) 486 .widenScalarToNextPow2(1, 32); 487 488 // TODO: Split s1->s64 during regbankselect for VALU. 489 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 490 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 491 .lowerFor({{S32, S64}}) 492 .lowerIf(typeIs(1, S1)) 493 .customFor({{S64, S64}}); 494 if (ST.has16BitInsts()) 495 IToFP.legalFor({{S16, S16}}); 496 IToFP.clampScalar(1, S32, S64) 497 .scalarize(0) 498 .widenScalarToNextPow2(1); 499 500 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 501 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 502 .customFor({{S64, S64}}); 503 if (ST.has16BitInsts()) 504 FPToI.legalFor({{S16, S16}}); 505 else 506 FPToI.minScalar(1, S32); 507 508 FPToI.minScalar(0, S32) 509 .scalarize(0) 510 .lower(); 511 512 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 513 .scalarize(0) 514 .lower(); 515 516 if (ST.has16BitInsts()) { 517 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 518 .legalFor({S16, S32, S64}) 519 .clampScalar(0, S16, S64) 520 .scalarize(0); 521 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 522 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 523 .legalFor({S32, S64}) 524 .clampScalar(0, S32, S64) 525 .scalarize(0); 526 } else { 527 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 528 .legalFor({S32}) 529 .customFor({S64}) 530 .clampScalar(0, S32, S64) 531 .scalarize(0); 532 } 533 534 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 535 .scalarize(0) 536 .alwaysLegal(); 537 538 auto &CmpBuilder = 539 getActionDefinitionsBuilder(G_ICMP) 540 // The compare output type differs based on the register bank of the output, 541 // so make both s1 and s32 legal. 
542 // 543 // Scalar compares producing output in scc will be promoted to s32, as that 544 // is the allocatable register type that will be needed for the copy from 545 // scc. This will be promoted during RegBankSelect, and we assume something 546 // before that won't try to use s32 result types. 547 // 548 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 549 // bank. 550 .legalForCartesianProduct( 551 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 552 .legalForCartesianProduct( 553 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 554 if (ST.has16BitInsts()) { 555 CmpBuilder.legalFor({{S1, S16}}); 556 } 557 558 CmpBuilder 559 .widenScalarToNextPow2(1) 560 .clampScalar(1, S32, S64) 561 .scalarize(0) 562 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 563 564 getActionDefinitionsBuilder(G_FCMP) 565 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 566 .widenScalarToNextPow2(1) 567 .clampScalar(1, S32, S64) 568 .scalarize(0); 569 570 // FIXME: fpow has a selection pattern that should move to custom lowering. 571 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW}); 572 if (ST.has16BitInsts()) 573 Exp2Ops.legalFor({S32, S16}); 574 else 575 Exp2Ops.legalFor({S32}); 576 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 577 Exp2Ops.scalarize(0); 578 579 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10}); 580 if (ST.has16BitInsts()) 581 ExpOps.customFor({{S32}, {S16}}); 582 else 583 ExpOps.customFor({S32}); 584 ExpOps.clampScalar(0, MinScalarFPTy, S32) 585 .scalarize(0); 586 587 // The 64-bit versions produce 32-bit results, but only on the SALU. 588 getActionDefinitionsBuilder(G_CTPOP) 589 .legalFor({{S32, S32}, {S32, S64}}) 590 .clampScalar(0, S32, S32) 591 .clampScalar(1, S32, S64) 592 .scalarize(0) 593 .widenScalarToNextPow2(0, 32) 594 .widenScalarToNextPow2(1, 32); 595 596 // The hardware instructions return a different result on 0 than the generic 597 // instructions expect. The hardware produces -1, but these produce the 598 // bitwidth. 599 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 600 .scalarize(0) 601 .clampScalar(0, S32, S32) 602 .clampScalar(1, S32, S64) 603 .widenScalarToNextPow2(0, 32) 604 .widenScalarToNextPow2(1, 32) 605 .lower(); 606 607 // The 64-bit versions produce 32-bit results, but only on the SALU. 608 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 609 .legalFor({{S32, S32}, {S32, S64}}) 610 .clampScalar(0, S32, S32) 611 .clampScalar(1, S32, S64) 612 .scalarize(0) 613 .widenScalarToNextPow2(0, 32) 614 .widenScalarToNextPow2(1, 32); 615 616 getActionDefinitionsBuilder(G_BITREVERSE) 617 .legalFor({S32}) 618 .clampScalar(0, S32, S32) 619 .scalarize(0); 620 621 if (ST.has16BitInsts()) { 622 getActionDefinitionsBuilder(G_BSWAP) 623 .legalFor({S16, S32, V2S16}) 624 .clampMaxNumElements(0, S16, 2) 625 // FIXME: Fixing non-power-of-2 before clamp is workaround for 626 // narrowScalar limitation. 
627 .widenScalarToNextPow2(0) 628 .clampScalar(0, S16, S32) 629 .scalarize(0); 630 631 if (ST.hasVOP3PInsts()) { 632 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 633 .legalFor({S32, S16, V2S16}) 634 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 635 .clampMaxNumElements(0, S16, 2) 636 .clampScalar(0, S16, S32) 637 .widenScalarToNextPow2(0) 638 .scalarize(0); 639 } else { 640 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 641 .legalFor({S32, S16}) 642 .widenScalarToNextPow2(0) 643 .clampScalar(0, S16, S32) 644 .scalarize(0); 645 } 646 } else { 647 // TODO: Should have same legality without v_perm_b32 648 getActionDefinitionsBuilder(G_BSWAP) 649 .legalFor({S32}) 650 .lowerIf(narrowerThan(0, 32)) 651 // FIXME: Fixing non-power-of-2 before clamp is workaround for 652 // narrowScalar limitation. 653 .widenScalarToNextPow2(0) 654 .maxScalar(0, S32) 655 .scalarize(0) 656 .lower(); 657 658 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 659 .legalFor({S32}) 660 .clampScalar(0, S32, S32) 661 .widenScalarToNextPow2(0) 662 .scalarize(0); 663 } 664 665 getActionDefinitionsBuilder(G_INTTOPTR) 666 // List the common cases 667 .legalForCartesianProduct(AddrSpaces64, {S64}) 668 .legalForCartesianProduct(AddrSpaces32, {S32}) 669 .scalarize(0) 670 // Accept any address space as long as the size matches 671 .legalIf(sameSize(0, 1)) 672 .widenScalarIf(smallerThan(1, 0), 673 [](const LegalityQuery &Query) { 674 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 675 }) 676 .narrowScalarIf(greaterThan(1, 0), 677 [](const LegalityQuery &Query) { 678 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 679 }); 680 681 getActionDefinitionsBuilder(G_PTRTOINT) 682 // List the common cases 683 .legalForCartesianProduct(AddrSpaces64, {S64}) 684 .legalForCartesianProduct(AddrSpaces32, {S32}) 685 .scalarize(0) 686 // Accept any address space as long as the size matches 687 .legalIf(sameSize(0, 1)) 688 .widenScalarIf(smallerThan(0, 1), 689 [](const LegalityQuery &Query) { 690 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 691 }) 692 .narrowScalarIf( 693 greaterThan(0, 1), 694 [](const LegalityQuery &Query) { 695 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 696 }); 697 698 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 699 .scalarize(0) 700 .custom(); 701 702 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 703 // handle some operations by just promoting the register during 704 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 705 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 706 switch (AS) { 707 // FIXME: Private element size. 708 case AMDGPUAS::PRIVATE_ADDRESS: 709 return 32; 710 // FIXME: Check subtarget 711 case AMDGPUAS::LOCAL_ADDRESS: 712 return ST.useDS128() ? 128 : 64; 713 714 // Treat constant and global as identical. SMRD loads are sometimes usable 715 // for global loads (ideally constant address space should be eliminated) 716 // depending on the context. Legality cannot be context dependent, but 717 // RegBankSelect can split the load as necessary depending on the pointer 718 // register bank/uniformity and if the memory is invariant or not written in 719 // a kernel. 720 case AMDGPUAS::CONSTANT_ADDRESS: 721 case AMDGPUAS::GLOBAL_ADDRESS: 722 return IsLoad ? 
512 : 128; 723 default: 724 return 128; 725 } 726 }; 727 728 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 729 bool IsLoad) -> bool { 730 const LLT DstTy = Query.Types[0]; 731 732 // Split vector extloads. 733 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 734 unsigned Align = Query.MMODescrs[0].AlignInBits; 735 736 if (MemSize < DstTy.getSizeInBits()) 737 MemSize = std::max(MemSize, Align); 738 739 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 740 return true; 741 742 const LLT PtrTy = Query.Types[1]; 743 unsigned AS = PtrTy.getAddressSpace(); 744 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 745 return true; 746 747 // Catch weird sized loads that don't evenly divide into the access sizes 748 // TODO: May be able to widen depending on alignment etc. 749 unsigned NumRegs = (MemSize + 31) / 32; 750 if (NumRegs == 3) { 751 if (!ST.hasDwordx3LoadStores()) 752 return true; 753 } else { 754 // If the alignment allows, these should have been widened. 755 if (!isPowerOf2_32(NumRegs)) 756 return true; 757 } 758 759 if (Align < MemSize) { 760 const SITargetLowering *TLI = ST.getTargetLowering(); 761 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 762 } 763 764 return false; 765 }; 766 767 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool { 768 unsigned Size = Query.Types[0].getSizeInBits(); 769 if (isPowerOf2_32(Size)) 770 return false; 771 772 if (Size == 96 && ST.hasDwordx3LoadStores()) 773 return false; 774 775 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 776 if (Size >= maxSizeForAddrSpace(AddrSpace, true)) 777 return false; 778 779 unsigned Align = Query.MMODescrs[0].AlignInBits; 780 unsigned RoundedSize = NextPowerOf2(Size); 781 return (Align >= RoundedSize); 782 }; 783 784 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 785 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 786 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 787 788 // TODO: Refine based on subtargets which support unaligned access or 128-bit 789 // LDS 790 // TODO: Unsupported flat for SI. 791 792 for (unsigned Op : {G_LOAD, G_STORE}) { 793 const bool IsStore = Op == G_STORE; 794 795 auto &Actions = getActionDefinitionsBuilder(Op); 796 // Whitelist the common cases. 
797 // TODO: Loads to s16 on gfx9 798 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 799 {V2S32, GlobalPtr, 64, GlobalAlign32}, 800 {V4S32, GlobalPtr, 128, GlobalAlign32}, 801 {S128, GlobalPtr, 128, GlobalAlign32}, 802 {S64, GlobalPtr, 64, GlobalAlign32}, 803 {V2S64, GlobalPtr, 128, GlobalAlign32}, 804 {V2S16, GlobalPtr, 32, GlobalAlign32}, 805 {S32, GlobalPtr, 8, GlobalAlign8}, 806 {S32, GlobalPtr, 16, GlobalAlign16}, 807 808 {S32, LocalPtr, 32, 32}, 809 {S64, LocalPtr, 64, 32}, 810 {V2S32, LocalPtr, 64, 32}, 811 {S32, LocalPtr, 8, 8}, 812 {S32, LocalPtr, 16, 16}, 813 {V2S16, LocalPtr, 32, 32}, 814 815 {S32, PrivatePtr, 32, 32}, 816 {S32, PrivatePtr, 8, 8}, 817 {S32, PrivatePtr, 16, 16}, 818 {V2S16, PrivatePtr, 32, 32}, 819 820 {S32, FlatPtr, 32, GlobalAlign32}, 821 {S32, FlatPtr, 16, GlobalAlign16}, 822 {S32, FlatPtr, 8, GlobalAlign8}, 823 {V2S16, FlatPtr, 32, GlobalAlign32}, 824 825 {S32, ConstantPtr, 32, GlobalAlign32}, 826 {V2S32, ConstantPtr, 64, GlobalAlign32}, 827 {V4S32, ConstantPtr, 128, GlobalAlign32}, 828 {S64, ConstantPtr, 64, GlobalAlign32}, 829 {S128, ConstantPtr, 128, GlobalAlign32}, 830 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 831 Actions 832 .customIf(typeIs(1, Constant32Ptr)) 833 // Widen suitably aligned loads by loading extra elements. 834 .moreElementsIf([=](const LegalityQuery &Query) { 835 const LLT Ty = Query.Types[0]; 836 return Op == G_LOAD && Ty.isVector() && 837 shouldWidenLoadResult(Query); 838 }, moreElementsToNextPow2(0)) 839 .widenScalarIf([=](const LegalityQuery &Query) { 840 const LLT Ty = Query.Types[0]; 841 return Op == G_LOAD && !Ty.isVector() && 842 shouldWidenLoadResult(Query); 843 }, widenScalarOrEltToNextPow2(0)) 844 .narrowScalarIf( 845 [=](const LegalityQuery &Query) -> bool { 846 return !Query.Types[0].isVector() && 847 needToSplitMemOp(Query, Op == G_LOAD); 848 }, 849 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 850 const LLT DstTy = Query.Types[0]; 851 const LLT PtrTy = Query.Types[1]; 852 853 const unsigned DstSize = DstTy.getSizeInBits(); 854 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 855 856 // Split extloads. 857 if (DstSize > MemSize) 858 return std::make_pair(0, LLT::scalar(MemSize)); 859 860 if (!isPowerOf2_32(DstSize)) { 861 // We're probably decomposing an odd sized store. Try to split 862 // to the widest type. TODO: Account for alignment. As-is it 863 // should be OK, since the new parts will be further legalized. 864 unsigned FloorSize = PowerOf2Floor(DstSize); 865 return std::make_pair(0, LLT::scalar(FloorSize)); 866 } 867 868 if (DstSize > 32 && (DstSize % 32 != 0)) { 869 // FIXME: Need a way to specify non-extload of larger size if 870 // suitably aligned. 
871 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 872 } 873 874 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 875 Op == G_LOAD); 876 if (MemSize > MaxSize) 877 return std::make_pair(0, LLT::scalar(MaxSize)); 878 879 unsigned Align = Query.MMODescrs[0].AlignInBits; 880 return std::make_pair(0, LLT::scalar(Align)); 881 }) 882 .fewerElementsIf( 883 [=](const LegalityQuery &Query) -> bool { 884 return Query.Types[0].isVector() && 885 needToSplitMemOp(Query, Op == G_LOAD); 886 }, 887 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 888 const LLT DstTy = Query.Types[0]; 889 const LLT PtrTy = Query.Types[1]; 890 891 LLT EltTy = DstTy.getElementType(); 892 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 893 Op == G_LOAD); 894 895 // FIXME: Handle widened to power of 2 results better. This ends 896 // up scalarizing. 897 // FIXME: 3 element stores scalarized on SI 898 899 // Split if it's too large for the address space. 900 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 901 unsigned NumElts = DstTy.getNumElements(); 902 unsigned EltSize = EltTy.getSizeInBits(); 903 904 if (MaxSize % EltSize == 0) { 905 return std::make_pair( 906 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 907 } 908 909 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 910 911 // FIXME: Refine when odd breakdowns handled 912 // The scalars will need to be re-legalized. 913 if (NumPieces == 1 || NumPieces >= NumElts || 914 NumElts % NumPieces != 0) 915 return std::make_pair(0, EltTy); 916 917 return std::make_pair(0, 918 LLT::vector(NumElts / NumPieces, EltTy)); 919 } 920 921 // FIXME: We could probably handle weird extending loads better. 922 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 923 if (DstTy.getSizeInBits() > MemSize) 924 return std::make_pair(0, EltTy); 925 926 unsigned EltSize = EltTy.getSizeInBits(); 927 unsigned DstSize = DstTy.getSizeInBits(); 928 if (!isPowerOf2_32(DstSize)) { 929 // We're probably decomposing an odd sized store. Try to split 930 // to the widest type. TODO: Account for alignment. As-is it 931 // should be OK, since the new parts will be further legalized. 932 unsigned FloorSize = PowerOf2Floor(DstSize); 933 return std::make_pair( 934 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 935 } 936 937 // Need to split because of alignment. 938 unsigned Align = Query.MMODescrs[0].AlignInBits; 939 if (EltSize > Align && 940 (EltSize / Align < DstTy.getNumElements())) { 941 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 942 } 943 944 // May need relegalization for the scalars. 945 return std::make_pair(0, EltTy); 946 }) 947 .minScalar(0, S32); 948 949 if (IsStore) 950 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 951 952 // TODO: Need a bitcast lower option? 953 Actions 954 .legalIf([=](const LegalityQuery &Query) { 955 const LLT Ty0 = Query.Types[0]; 956 unsigned Size = Ty0.getSizeInBits(); 957 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 958 unsigned Align = Query.MMODescrs[0].AlignInBits; 959 960 // FIXME: Widening store from alignment not valid. 961 if (MemSize < Size) 962 MemSize = std::max(MemSize, Align); 963 964 // No extending vector loads. 
965 if (Size > MemSize && Ty0.isVector()) 966 return false; 967 968 switch (MemSize) { 969 case 8: 970 case 16: 971 return Size == 32; 972 case 32: 973 case 64: 974 case 128: 975 return true; 976 case 96: 977 return ST.hasDwordx3LoadStores(); 978 case 256: 979 case 512: 980 return true; 981 default: 982 return false; 983 } 984 }) 985 .widenScalarToNextPow2(0) 986 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 987 } 988 989 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 990 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 991 {S32, GlobalPtr, 16, 2 * 8}, 992 {S32, LocalPtr, 8, 8}, 993 {S32, LocalPtr, 16, 16}, 994 {S32, PrivatePtr, 8, 8}, 995 {S32, PrivatePtr, 16, 16}, 996 {S32, ConstantPtr, 8, 8}, 997 {S32, ConstantPtr, 16, 2 * 8}}); 998 if (ST.hasFlatAddressSpace()) { 999 ExtLoads.legalForTypesWithMemDesc( 1000 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1001 } 1002 1003 ExtLoads.clampScalar(0, S32, S32) 1004 .widenScalarToNextPow2(0) 1005 .unsupportedIfMemSizeNotPow2() 1006 .lower(); 1007 1008 auto &Atomics = getActionDefinitionsBuilder( 1009 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1010 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1011 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1012 G_ATOMICRMW_UMIN}) 1013 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1014 {S64, GlobalPtr}, {S64, LocalPtr}}); 1015 if (ST.hasFlatAddressSpace()) { 1016 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1017 } 1018 1019 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1020 .legalFor({{S32, LocalPtr}}); 1021 1022 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1023 // demarshalling 1024 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1025 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1026 {S32, FlatPtr}, {S64, FlatPtr}}) 1027 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1028 {S32, RegionPtr}, {S64, RegionPtr}}); 1029 // TODO: Pointer types, any 32-bit or 64-bit vector 1030 1031 // Condition should be s32 for scalar, s1 for vector. 1032 getActionDefinitionsBuilder(G_SELECT) 1033 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1034 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1035 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1036 .clampScalar(0, S16, S64) 1037 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1038 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1039 .scalarize(1) 1040 .clampMaxNumElements(0, S32, 2) 1041 .clampMaxNumElements(0, LocalPtr, 2) 1042 .clampMaxNumElements(0, PrivatePtr, 2) 1043 .scalarize(0) 1044 .widenScalarToNextPow2(0) 1045 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1046 1047 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1048 // be more flexible with the shift amount type. 1049 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1050 .legalFor({{S32, S32}, {S64, S32}}); 1051 if (ST.has16BitInsts()) { 1052 if (ST.hasVOP3PInsts()) { 1053 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 1054 .clampMaxNumElements(0, S16, 2); 1055 } else 1056 Shifts.legalFor({{S16, S32}, {S16, S16}}); 1057 1058 // TODO: Support 16-bit shift amounts 1059 Shifts.clampScalar(1, S32, S32); 1060 Shifts.clampScalar(0, S16, S64); 1061 Shifts.widenScalarToNextPow2(0, 16); 1062 } else { 1063 // Make sure we legalize the shift amount type first, as the general 1064 // expansion for the shifted type will produce much worse code if it hasn't 1065 // been truncated already. 
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1165 if (ST.hasVOP3PInsts()) { 1166 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1167 .customFor({V2S16, V2S16}) 1168 .lower(); 1169 } else 1170 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1171 1172 // Merge/Unmerge 1173 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1174 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1175 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1176 1177 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1178 const LLT &Ty = Query.Types[TypeIdx]; 1179 if (Ty.isVector()) { 1180 const LLT &EltTy = Ty.getElementType(); 1181 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) 1182 return true; 1183 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1184 return true; 1185 } 1186 return false; 1187 }; 1188 1189 auto &Builder = getActionDefinitionsBuilder(Op) 1190 // Try to widen to s16 first for small types. 1191 // TODO: Only do this on targets with legal s16 shifts 1192 .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1193 1194 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1195 .lowerFor({{S16, V2S16}}) 1196 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1197 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1198 elementTypeIs(1, S16)), 1199 changeTo(1, V2S16)) 1200 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1201 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1202 // valid. 1203 .clampScalar(LitTyIdx, S32, S256) 1204 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1205 // Break up vectors with weird elements into scalars 1206 .fewerElementsIf( 1207 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, 1208 scalarize(0)) 1209 .fewerElementsIf( 1210 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, 1211 scalarize(1)) 1212 .clampScalar(BigTyIdx, S32, S1024); 1213 1214 if (Op == G_MERGE_VALUES) { 1215 Builder.widenScalarIf( 1216 // TODO: Use 16-bit shifts if legal for 8-bit values? 1217 [=](const LegalityQuery &Query) { 1218 const LLT Ty = Query.Types[LitTyIdx]; 1219 return Ty.getSizeInBits() < 32; 1220 }, 1221 changeTo(LitTyIdx, S32)); 1222 } 1223 1224 Builder.widenScalarIf( 1225 [=](const LegalityQuery &Query) { 1226 const LLT Ty = Query.Types[BigTyIdx]; 1227 return !isPowerOf2_32(Ty.getSizeInBits()) && 1228 Ty.getSizeInBits() % 16 != 0; 1229 }, 1230 [=](const LegalityQuery &Query) { 1231 // Pick the next power of 2, or a multiple of 64 over 128. 1232 // Whichever is smaller. 1233 const LLT &Ty = Query.Types[BigTyIdx]; 1234 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1235 if (NewSizeInBits >= 256) { 1236 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1237 if (RoundedTo < NewSizeInBits) 1238 NewSizeInBits = RoundedTo; 1239 } 1240 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1241 }) 1242 .legalIf([=](const LegalityQuery &Query) { 1243 const LLT &BigTy = Query.Types[BigTyIdx]; 1244 const LLT &LitTy = Query.Types[LitTyIdx]; 1245 1246 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1247 return false; 1248 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1249 return false; 1250 1251 return BigTy.getSizeInBits() % 16 == 0 && 1252 LitTy.getSizeInBits() % 16 == 0 && 1253 BigTy.getSizeInBits() <= 1024; 1254 }) 1255 // Any vectors left are the wrong size. Scalarize them. 1256 .scalarize(0) 1257 .scalarize(1); 1258 } 1259 1260 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1261 // RegBankSelect. 
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case
TargetOpcode::G_FFLOOR: 1362 return legalizeFFloor(MI, MRI, B); 1363 case TargetOpcode::G_BUILD_VECTOR: 1364 return legalizeBuildVector(MI, MRI, B); 1365 default: 1366 return false; 1367 } 1368 1369 llvm_unreachable("expected switch to return"); 1370 } 1371 1372 Register AMDGPULegalizerInfo::getSegmentAperture( 1373 unsigned AS, 1374 MachineRegisterInfo &MRI, 1375 MachineIRBuilder &B) const { 1376 MachineFunction &MF = B.getMF(); 1377 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1378 const LLT S32 = LLT::scalar(32); 1379 1380 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1381 1382 if (ST.hasApertureRegs()) { 1383 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1384 // getreg. 1385 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1386 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1387 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1388 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1389 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1390 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1391 unsigned Encoding = 1392 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1393 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1394 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1395 1396 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1397 1398 B.buildInstr(AMDGPU::S_GETREG_B32) 1399 .addDef(GetReg) 1400 .addImm(Encoding); 1401 MRI.setType(GetReg, S32); 1402 1403 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1404 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1405 } 1406 1407 Register QueuePtr = MRI.createGenericVirtualRegister( 1408 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1409 1410 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1411 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1412 return Register(); 1413 1414 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1415 // private_segment_aperture_base_hi. 1416 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1417 1418 // TODO: can we be smarter about machine pointer info? 1419 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1420 MachineMemOperand *MMO = MF.getMachineMemOperand( 1421 PtrInfo, 1422 MachineMemOperand::MOLoad | 1423 MachineMemOperand::MODereferenceable | 1424 MachineMemOperand::MOInvariant, 1425 4, 1426 MinAlign(64, StructOffset)); 1427 1428 Register LoadAddr; 1429 1430 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1431 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1432 } 1433 1434 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1435 MachineInstr &MI, MachineRegisterInfo &MRI, 1436 MachineIRBuilder &B) const { 1437 MachineFunction &MF = B.getMF(); 1438 1439 B.setInstr(MI); 1440 1441 const LLT S32 = LLT::scalar(32); 1442 Register Dst = MI.getOperand(0).getReg(); 1443 Register Src = MI.getOperand(1).getReg(); 1444 1445 LLT DstTy = MRI.getType(Dst); 1446 LLT SrcTy = MRI.getType(Src); 1447 unsigned DestAS = DstTy.getAddressSpace(); 1448 unsigned SrcAS = SrcTy.getAddressSpace(); 1449 1450 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1451 // vector element. 
1452 assert(!DstTy.isVector()); 1453 1454 const AMDGPUTargetMachine &TM 1455 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1456 1457 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1458 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1459 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1460 return true; 1461 } 1462 1463 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1464 // Truncate. 1465 B.buildExtract(Dst, Src, 0); 1466 MI.eraseFromParent(); 1467 return true; 1468 } 1469 1470 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1471 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1472 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1473 1474 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1475 // another. Merge operands are required to be the same type, but creating an 1476 // extra ptrtoint would be kind of pointless. 1477 auto HighAddr = B.buildConstant( 1478 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1479 B.buildMerge(Dst, {Src, HighAddr}); 1480 MI.eraseFromParent(); 1481 return true; 1482 } 1483 1484 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1485 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1486 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1487 unsigned NullVal = TM.getNullPointerValue(DestAS); 1488 1489 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1490 auto FlatNull = B.buildConstant(SrcTy, 0); 1491 1492 // Extract low 32-bits of the pointer. 1493 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1494 1495 auto CmpRes = 1496 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1497 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1498 1499 MI.eraseFromParent(); 1500 return true; 1501 } 1502 1503 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1504 return false; 1505 1506 if (!ST.hasFlatAddressSpace()) 1507 return false; 1508 1509 auto SegmentNull = 1510 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1511 auto FlatNull = 1512 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1513 1514 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1515 if (!ApertureReg.isValid()) 1516 return false; 1517 1518 auto CmpRes = 1519 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1520 1521 // Coerce the type of the low half of the result so we can use merge_values. 1522 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1523 1524 // TODO: Should we allow mismatched types but matching sizes in merges to 1525 // avoid the ptrtoint? 1526 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1527 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1528 1529 MI.eraseFromParent(); 1530 return true; 1531 } 1532 1533 bool AMDGPULegalizerInfo::legalizeFrint( 1534 MachineInstr &MI, MachineRegisterInfo &MRI, 1535 MachineIRBuilder &B) const { 1536 B.setInstr(MI); 1537 1538 Register Src = MI.getOperand(1).getReg(); 1539 LLT Ty = MRI.getType(Src); 1540 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1541 1542 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1543 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1544 1545 auto C1 = B.buildFConstant(Ty, C1Val); 1546 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1547 1548 // TODO: Should this propagate fast-math-flags? 
1549 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1550 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1551 1552 auto C2 = B.buildFConstant(Ty, C2Val); 1553 auto Fabs = B.buildFAbs(Ty, Src); 1554 1555 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1556 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1557 return true; 1558 } 1559 1560 bool AMDGPULegalizerInfo::legalizeFceil( 1561 MachineInstr &MI, MachineRegisterInfo &MRI, 1562 MachineIRBuilder &B) const { 1563 B.setInstr(MI); 1564 1565 const LLT S1 = LLT::scalar(1); 1566 const LLT S64 = LLT::scalar(64); 1567 1568 Register Src = MI.getOperand(1).getReg(); 1569 assert(MRI.getType(Src) == S64); 1570 1571 // result = trunc(src) 1572 // if (src > 0.0 && src != result) 1573 // result += 1.0 1574 1575 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1576 1577 const auto Zero = B.buildFConstant(S64, 0.0); 1578 const auto One = B.buildFConstant(S64, 1.0); 1579 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1580 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1581 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1582 auto Add = B.buildSelect(S64, And, One, Zero); 1583 1584 // TODO: Should this propagate fast-math-flags? 1585 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1586 return true; 1587 } 1588 1589 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1590 MachineIRBuilder &B) { 1591 const unsigned FractBits = 52; 1592 const unsigned ExpBits = 11; 1593 LLT S32 = LLT::scalar(32); 1594 1595 auto Const0 = B.buildConstant(S32, FractBits - 32); 1596 auto Const1 = B.buildConstant(S32, ExpBits); 1597 1598 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1599 .addUse(Const0.getReg(0)) 1600 .addUse(Const1.getReg(0)); 1601 1602 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1603 } 1604 1605 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1606 MachineInstr &MI, MachineRegisterInfo &MRI, 1607 MachineIRBuilder &B) const { 1608 B.setInstr(MI); 1609 1610 const LLT S1 = LLT::scalar(1); 1611 const LLT S32 = LLT::scalar(32); 1612 const LLT S64 = LLT::scalar(64); 1613 1614 Register Src = MI.getOperand(1).getReg(); 1615 assert(MRI.getType(Src) == S64); 1616 1617 // TODO: Should this use extract since the low half is unused? 1618 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1619 Register Hi = Unmerge.getReg(1); 1620 1621 // Extract the upper half, since this is where we will find the sign and 1622 // exponent. 1623 auto Exp = extractF64Exponent(Hi, B); 1624 1625 const unsigned FractBits = 52; 1626 1627 // Extract the sign bit. 1628 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1629 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1630 1631 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1632 1633 const auto Zero32 = B.buildConstant(S32, 0); 1634 1635 // Extend back to 64-bits. 
1636 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1637 1638 auto Shr = B.buildAShr(S64, FractMask, Exp); 1639 auto Not = B.buildNot(S64, Shr); 1640 auto Tmp0 = B.buildAnd(S64, Src, Not); 1641 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1642 1643 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1644 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1645 1646 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1647 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1648 return true; 1649 } 1650 1651 bool AMDGPULegalizerInfo::legalizeITOFP( 1652 MachineInstr &MI, MachineRegisterInfo &MRI, 1653 MachineIRBuilder &B, bool Signed) const { 1654 B.setInstr(MI); 1655 1656 Register Dst = MI.getOperand(0).getReg(); 1657 Register Src = MI.getOperand(1).getReg(); 1658 1659 const LLT S64 = LLT::scalar(64); 1660 const LLT S32 = LLT::scalar(32); 1661 1662 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1663 1664 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1665 1666 auto CvtHi = Signed ? 1667 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1668 B.buildUITOFP(S64, Unmerge.getReg(1)); 1669 1670 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1671 1672 auto ThirtyTwo = B.buildConstant(S32, 32); 1673 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1674 .addUse(CvtHi.getReg(0)) 1675 .addUse(ThirtyTwo.getReg(0)); 1676 1677 // TODO: Should this propagate fast-math-flags? 1678 B.buildFAdd(Dst, LdExp, CvtLo); 1679 MI.eraseFromParent(); 1680 return true; 1681 } 1682 1683 // TODO: Copied from DAG implementation. Verify logic and document how this 1684 // actually works. 1685 bool AMDGPULegalizerInfo::legalizeFPTOI( 1686 MachineInstr &MI, MachineRegisterInfo &MRI, 1687 MachineIRBuilder &B, bool Signed) const { 1688 B.setInstr(MI); 1689 1690 Register Dst = MI.getOperand(0).getReg(); 1691 Register Src = MI.getOperand(1).getReg(); 1692 1693 const LLT S64 = LLT::scalar(64); 1694 const LLT S32 = LLT::scalar(32); 1695 1696 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1697 1698 unsigned Flags = MI.getFlags(); 1699 1700 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1701 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1702 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1703 1704 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1705 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1706 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1707 1708 auto Hi = Signed ? 
1709 B.buildFPTOSI(S32, FloorMul) : 1710 B.buildFPTOUI(S32, FloorMul); 1711 auto Lo = B.buildFPTOUI(S32, Fma); 1712 1713 B.buildMerge(Dst, { Lo, Hi }); 1714 MI.eraseFromParent(); 1715 1716 return true; 1717 } 1718 1719 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1720 MachineInstr &MI, MachineRegisterInfo &MRI, 1721 MachineIRBuilder &B) const { 1722 MachineFunction &MF = B.getMF(); 1723 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1724 1725 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1726 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1727 1728 // With ieee_mode disabled, the instructions have the correct behavior 1729 // already for G_FMINNUM/G_FMAXNUM 1730 if (!MFI->getMode().IEEE) 1731 return !IsIEEEOp; 1732 1733 if (IsIEEEOp) 1734 return true; 1735 1736 MachineIRBuilder HelperBuilder(MI); 1737 GISelObserverWrapper DummyObserver; 1738 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1739 HelperBuilder.setInstr(MI); 1740 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1741 } 1742 1743 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1744 MachineInstr &MI, MachineRegisterInfo &MRI, 1745 MachineIRBuilder &B) const { 1746 // TODO: Should move some of this into LegalizerHelper. 1747 1748 // TODO: Promote dynamic indexing of s16 to s32 1749 1750 // FIXME: Artifact combiner probably should have replaced the truncated 1751 // constant before this, so we shouldn't need 1752 // getConstantVRegValWithLookThrough. 1753 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1754 MI.getOperand(2).getReg(), MRI); 1755 if (!IdxVal) // Dynamic case will be selected to register indexing. 1756 return true; 1757 1758 Register Dst = MI.getOperand(0).getReg(); 1759 Register Vec = MI.getOperand(1).getReg(); 1760 1761 LLT VecTy = MRI.getType(Vec); 1762 LLT EltTy = VecTy.getElementType(); 1763 assert(EltTy == MRI.getType(Dst)); 1764 1765 B.setInstr(MI); 1766 1767 if (IdxVal->Value < VecTy.getNumElements()) 1768 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1769 else 1770 B.buildUndef(Dst); 1771 1772 MI.eraseFromParent(); 1773 return true; 1774 } 1775 1776 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1777 MachineInstr &MI, MachineRegisterInfo &MRI, 1778 MachineIRBuilder &B) const { 1779 // TODO: Should move some of this into LegalizerHelper. 1780 1781 // TODO: Promote dynamic indexing of s16 to s32 1782 1783 // FIXME: Artifact combiner probably should have replaced the truncated 1784 // constant before this, so we shouldn't need 1785 // getConstantVRegValWithLookThrough. 1786 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1787 MI.getOperand(3).getReg(), MRI); 1788 if (!IdxVal) // Dynamic case will be selected to register indexing. 1789 return true; 1790 1791 Register Dst = MI.getOperand(0).getReg(); 1792 Register Vec = MI.getOperand(1).getReg(); 1793 Register Ins = MI.getOperand(2).getReg(); 1794 1795 LLT VecTy = MRI.getType(Vec); 1796 LLT EltTy = VecTy.getElementType(); 1797 assert(EltTy == MRI.getType(Ins)); 1798 1799 B.setInstr(MI); 1800 1801 if (IdxVal->Value < VecTy.getNumElements()) 1802 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1803 else 1804 B.buildUndef(Dst); 1805 1806 MI.eraseFromParent(); 1807 return true; 1808 } 1809 1810 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) { 1811 assert(Mask.size() == 2); 1812 1813 // If one half is undef, the other is trivially in the same reg. 
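// Otherwise both lanes must select from the same source register: either the
// low pair (0, 1) or the high pair (2, 3).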
1814 if (Mask[0] == -1 || Mask[1] == -1) 1815 return true; 1816 return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) || 1817 ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3)); 1818 } 1819 1820 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1821 MachineInstr &MI, MachineRegisterInfo &MRI, 1822 MachineIRBuilder &B) const { 1823 const LLT V2S16 = LLT::vector(2, 16); 1824 1825 Register Dst = MI.getOperand(0).getReg(); 1826 Register Src0 = MI.getOperand(1).getReg(); 1827 LLT DstTy = MRI.getType(Dst); 1828 LLT SrcTy = MRI.getType(Src0); 1829 1830 if (SrcTy == V2S16 && DstTy == V2S16 && 1831 isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1832 return true; 1833 1834 MachineIRBuilder HelperBuilder(MI); 1835 GISelObserverWrapper DummyObserver; 1836 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1837 HelperBuilder.setInstr(MI); 1838 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1839 } 1840 1841 bool AMDGPULegalizerInfo::legalizeSinCos( 1842 MachineInstr &MI, MachineRegisterInfo &MRI, 1843 MachineIRBuilder &B) const { 1844 B.setInstr(MI); 1845 1846 Register DstReg = MI.getOperand(0).getReg(); 1847 Register SrcReg = MI.getOperand(1).getReg(); 1848 LLT Ty = MRI.getType(DstReg); 1849 unsigned Flags = MI.getFlags(); 1850 1851 Register TrigVal; 1852 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1853 if (ST.hasTrigReducedRange()) { 1854 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1855 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1856 .addUse(MulVal.getReg(0)) 1857 .setMIFlags(Flags).getReg(0); 1858 } else 1859 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1860 1861 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1862 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1863 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1864 .addUse(TrigVal) 1865 .setMIFlags(Flags); 1866 MI.eraseFromParent(); 1867 return true; 1868 } 1869 1870 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1871 Register DstReg, LLT PtrTy, 1872 MachineIRBuilder &B, const GlobalValue *GV, 1873 unsigned Offset, unsigned GAFlags) const { 1874 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1875 // to the following code sequence: 1876 // 1877 // For constant address space: 1878 // s_getpc_b64 s[0:1] 1879 // s_add_u32 s0, s0, $symbol 1880 // s_addc_u32 s1, s1, 0 1881 // 1882 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1883 // a fixup or relocation is emitted to replace $symbol with a literal 1884 // constant, which is a pc-relative offset from the encoding of the $symbol 1885 // operand to the global variable. 1886 // 1887 // For global address space: 1888 // s_getpc_b64 s[0:1] 1889 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1890 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1891 // 1892 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1893 // fixups or relocations are emitted to replace $symbol@*@lo and 1894 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1895 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1896 // operand to the global variable. 
1897 // 1898 // What we want here is an offset from the value returned by s_getpc 1899 // (which is the address of the s_add_u32 instruction) to the global 1900 // variable, but since the encoding of $symbol starts 4 bytes after the start 1901 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1902 // small. This requires us to add 4 to the global variable offset in order to 1903 // compute the correct address. 1904 1905 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1906 1907 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1908 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1909 1910 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1911 .addDef(PCReg); 1912 1913 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1914 if (GAFlags == SIInstrInfo::MO_NONE) 1915 MIB.addImm(0); 1916 else 1917 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1918 1919 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1920 1921 if (PtrTy.getSizeInBits() == 32) 1922 B.buildExtract(DstReg, PCReg, 0); 1923 return true; 1924 } 1925 1926 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1927 MachineInstr &MI, MachineRegisterInfo &MRI, 1928 MachineIRBuilder &B) const { 1929 Register DstReg = MI.getOperand(0).getReg(); 1930 LLT Ty = MRI.getType(DstReg); 1931 unsigned AS = Ty.getAddressSpace(); 1932 1933 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1934 MachineFunction &MF = B.getMF(); 1935 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1936 B.setInstr(MI); 1937 1938 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1939 if (!MFI->isEntryFunction()) { 1940 const Function &Fn = MF.getFunction(); 1941 DiagnosticInfoUnsupported BadLDSDecl( 1942 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1943 Fn.getContext().diagnose(BadLDSDecl); 1944 } 1945 1946 // TODO: We could emit code to handle the initialization somewhere. 1947 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1948 const SITargetLowering *TLI = ST.getTargetLowering(); 1949 if (!TLI->shouldUseLDSConstAddress(GV)) { 1950 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 1951 return true; // Leave in place; 1952 } 1953 1954 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 1955 MI.eraseFromParent(); 1956 return true; 1957 } 1958 1959 const Function &Fn = MF.getFunction(); 1960 DiagnosticInfoUnsupported BadInit( 1961 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 1962 Fn.getContext().diagnose(BadInit); 1963 return true; 1964 } 1965 1966 const SITargetLowering *TLI = ST.getTargetLowering(); 1967 1968 if (TLI->shouldEmitFixup(GV)) { 1969 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 1970 MI.eraseFromParent(); 1971 return true; 1972 } 1973 1974 if (TLI->shouldEmitPCReloc(GV)) { 1975 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 1976 MI.eraseFromParent(); 1977 return true; 1978 } 1979 1980 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1981 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 1982 1983 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 1984 MachinePointerInfo::getGOT(MF), 1985 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1986 MachineMemOperand::MOInvariant, 1987 8 /*Size*/, 8 /*Align*/); 1988 1989 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 1990 1991 if (Ty.getSizeInBits() == 32) { 1992 // Truncate if this is a 32-bit constant adrdess. 
1993 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 1994 B.buildExtract(DstReg, Load, 0); 1995 } else 1996 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 1997 1998 MI.eraseFromParent(); 1999 return true; 2000 } 2001 2002 bool AMDGPULegalizerInfo::legalizeLoad( 2003 MachineInstr &MI, MachineRegisterInfo &MRI, 2004 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2005 B.setInstr(MI); 2006 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2007 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2008 Observer.changingInstr(MI); 2009 MI.getOperand(1).setReg(Cast.getReg(0)); 2010 Observer.changedInstr(MI); 2011 return true; 2012 } 2013 2014 bool AMDGPULegalizerInfo::legalizeFMad( 2015 MachineInstr &MI, MachineRegisterInfo &MRI, 2016 MachineIRBuilder &B) const { 2017 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2018 assert(Ty.isScalar()); 2019 2020 MachineFunction &MF = B.getMF(); 2021 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2022 2023 // TODO: Always legal with future ftz flag. 2024 // FIXME: Do we need just output? 2025 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2026 return true; 2027 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2028 return true; 2029 2030 MachineIRBuilder HelperBuilder(MI); 2031 GISelObserverWrapper DummyObserver; 2032 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2033 HelperBuilder.setMBB(*MI.getParent()); 2034 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2035 } 2036 2037 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2038 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2039 Register DstReg = MI.getOperand(0).getReg(); 2040 Register PtrReg = MI.getOperand(1).getReg(); 2041 Register CmpVal = MI.getOperand(2).getReg(); 2042 Register NewVal = MI.getOperand(3).getReg(); 2043 2044 assert(SITargetLowering::isFlatGlobalAddrSpace( 2045 MRI.getType(PtrReg).getAddressSpace()) && 2046 "this should not have been custom lowered"); 2047 2048 LLT ValTy = MRI.getType(CmpVal); 2049 LLT VecTy = LLT::vector(2, ValTy); 2050 2051 B.setInstr(MI); 2052 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2053 2054 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2055 .addDef(DstReg) 2056 .addUse(PtrReg) 2057 .addUse(PackedVal) 2058 .setMemRefs(MI.memoperands()); 2059 2060 MI.eraseFromParent(); 2061 return true; 2062 } 2063 2064 bool AMDGPULegalizerInfo::legalizeFlog( 2065 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2066 Register Dst = MI.getOperand(0).getReg(); 2067 Register Src = MI.getOperand(1).getReg(); 2068 LLT Ty = B.getMRI()->getType(Dst); 2069 unsigned Flags = MI.getFlags(); 2070 B.setInstr(MI); 2071 2072 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2073 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2074 2075 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2076 MI.eraseFromParent(); 2077 return true; 2078 } 2079 2080 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2081 MachineIRBuilder &B) const { 2082 Register Dst = MI.getOperand(0).getReg(); 2083 Register Src = MI.getOperand(1).getReg(); 2084 unsigned Flags = MI.getFlags(); 2085 LLT Ty = B.getMRI()->getType(Dst); 2086 B.setInstr(MI); 2087 2088 auto K = B.buildFConstant(Ty, numbers::log2e); 2089 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2090 B.buildFExp2(Dst, Mul, Flags); 2091 MI.eraseFromParent(); 2092 return true; 2093 } 2094 2095 // Find a source register, ignoring 
any possible source modifiers. 2096 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2097 Register ModSrc = OrigSrc; 2098 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2099 ModSrc = SrcFNeg->getOperand(1).getReg(); 2100 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2101 ModSrc = SrcFAbs->getOperand(1).getReg(); 2102 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2103 ModSrc = SrcFAbs->getOperand(1).getReg(); 2104 return ModSrc; 2105 } 2106 2107 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2108 MachineRegisterInfo &MRI, 2109 MachineIRBuilder &B) const { 2110 B.setInstr(MI); 2111 2112 const LLT S1 = LLT::scalar(1); 2113 const LLT S64 = LLT::scalar(64); 2114 Register Dst = MI.getOperand(0).getReg(); 2115 Register OrigSrc = MI.getOperand(1).getReg(); 2116 unsigned Flags = MI.getFlags(); 2117 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2118 "this should not have been custom lowered"); 2119 2120 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2121 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2122 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2123 // V_FRACT bug is: 2124 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2125 // 2126 // Convert floor(x) to (x - fract(x)) 2127 2128 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2129 .addUse(OrigSrc) 2130 .setMIFlags(Flags); 2131 2132 // Give source modifier matching some assistance before obscuring a foldable 2133 // pattern. 2134 2135 // TODO: We can avoid the neg on the fract? The input sign to fract 2136 // shouldn't matter? 2137 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2138 2139 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2140 2141 Register Min = MRI.createGenericVirtualRegister(S64); 2142 2143 // We don't need to concern ourselves with the snan handling difference, so 2144 // use the one which will directly select. 2145 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2146 if (MFI->getMode().IEEE) 2147 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2148 else 2149 B.buildFMinNum(Min, Fract, Const, Flags); 2150 2151 Register CorrectedFract = Min; 2152 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2153 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2154 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2155 } 2156 2157 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2158 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2159 2160 MI.eraseFromParent(); 2161 return true; 2162 } 2163 2164 // Turn an illegal packed v2s16 build vector into bit operations. 2165 // TODO: This should probably be a bitcast action in LegalizerHelper. 
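// The two s16 sources are merged into a single s32 and bitcast to the packed
// v2s16 result.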
2166 bool AMDGPULegalizerInfo::legalizeBuildVector( 2167 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2168 Register Dst = MI.getOperand(0).getReg(); 2169 LLT DstTy = MRI.getType(Dst); 2170 const LLT S32 = LLT::scalar(32); 2171 const LLT V2S16 = LLT::vector(2, 16); 2172 (void)DstTy; 2173 (void)V2S16; 2174 assert(DstTy == V2S16); 2175 2176 Register Src0 = MI.getOperand(1).getReg(); 2177 Register Src1 = MI.getOperand(2).getReg(); 2178 assert(MRI.getType(Src0) == LLT::scalar(16)); 2179 2180 B.setInstr(MI); 2181 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2182 B.buildBitcast(Dst, Merge); 2183 2184 MI.eraseFromParent(); 2185 return true; 2186 } 2187 2188 // Return the use branch instruction, otherwise null if the usage is invalid. 2189 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2190 MachineRegisterInfo &MRI, 2191 MachineInstr *&Br) { 2192 Register CondDef = MI.getOperand(0).getReg(); 2193 if (!MRI.hasOneNonDBGUse(CondDef)) 2194 return nullptr; 2195 2196 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2197 if (UseMI.getParent() != MI.getParent() || 2198 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2199 return nullptr; 2200 2201 // Make sure the cond br is followed by a G_BR 2202 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2203 if (Next != MI.getParent()->end()) { 2204 if (Next->getOpcode() != AMDGPU::G_BR) 2205 return nullptr; 2206 Br = &*Next; 2207 } 2208 2209 return &UseMI; 2210 } 2211 2212 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, 2213 Register Reg, LLT Ty) const { 2214 Register LiveIn = MRI.getLiveInVirtReg(Reg); 2215 if (LiveIn) 2216 return LiveIn; 2217 2218 Register NewReg = MRI.createGenericVirtualRegister(Ty); 2219 MRI.addLiveIn(Reg, NewReg); 2220 return NewReg; 2221 } 2222 2223 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2224 const ArgDescriptor *Arg) const { 2225 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2226 return false; // TODO: Handle these 2227 2228 assert(Arg->getRegister().isPhysical()); 2229 2230 MachineRegisterInfo &MRI = *B.getMRI(); 2231 2232 LLT Ty = MRI.getType(DstReg); 2233 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); 2234 2235 if (Arg->isMasked()) { 2236 // TODO: Should we try to emit this once in the entry block? 2237 const LLT S32 = LLT::scalar(32); 2238 const unsigned Mask = Arg->getMask(); 2239 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2240 2241 Register AndMaskSrc = LiveIn; 2242 2243 if (Shift != 0) { 2244 auto ShiftAmt = B.buildConstant(S32, Shift); 2245 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2246 } 2247 2248 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2249 } else 2250 B.buildCopy(DstReg, LiveIn); 2251 2252 // Insert the argument copy if it doens't already exist. 2253 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
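// If the live-in vreg has no def yet, emit the copy from the physical argument
// register at the start of the entry block so it dominates all uses, then
// restore the original insertion point.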
2254 if (!MRI.getVRegDef(LiveIn)) { 2255 // FIXME: Should have scoped insert pt 2256 MachineBasicBlock &OrigInsBB = B.getMBB(); 2257 auto OrigInsPt = B.getInsertPt(); 2258 2259 MachineBasicBlock &EntryMBB = B.getMF().front(); 2260 EntryMBB.addLiveIn(Arg->getRegister()); 2261 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2262 B.buildCopy(LiveIn, Arg->getRegister()); 2263 2264 B.setInsertPt(OrigInsBB, OrigInsPt); 2265 } 2266 2267 return true; 2268 } 2269 2270 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2271 MachineInstr &MI, 2272 MachineRegisterInfo &MRI, 2273 MachineIRBuilder &B, 2274 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2275 B.setInstr(MI); 2276 2277 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2278 2279 const ArgDescriptor *Arg; 2280 const TargetRegisterClass *RC; 2281 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2282 if (!Arg) { 2283 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2284 return false; 2285 } 2286 2287 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 2288 MI.eraseFromParent(); 2289 return true; 2290 } 2291 2292 return false; 2293 } 2294 2295 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2296 MachineRegisterInfo &MRI, 2297 MachineIRBuilder &B) const { 2298 B.setInstr(MI); 2299 Register Dst = MI.getOperand(0).getReg(); 2300 LLT DstTy = MRI.getType(Dst); 2301 LLT S16 = LLT::scalar(16); 2302 LLT S32 = LLT::scalar(32); 2303 LLT S64 = LLT::scalar(64); 2304 2305 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2306 return true; 2307 2308 if (DstTy == S16) 2309 return legalizeFDIV16(MI, MRI, B); 2310 if (DstTy == S32) 2311 return legalizeFDIV32(MI, MRI, B); 2312 if (DstTy == S64) 2313 return legalizeFDIV64(MI, MRI, B); 2314 2315 return false; 2316 } 2317 2318 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2319 MachineRegisterInfo &MRI, 2320 MachineIRBuilder &B) const { 2321 Register Res = MI.getOperand(0).getReg(); 2322 Register LHS = MI.getOperand(1).getReg(); 2323 Register RHS = MI.getOperand(2).getReg(); 2324 2325 uint16_t Flags = MI.getFlags(); 2326 2327 LLT ResTy = MRI.getType(Res); 2328 LLT S32 = LLT::scalar(32); 2329 LLT S64 = LLT::scalar(64); 2330 2331 const MachineFunction &MF = B.getMF(); 2332 bool Unsafe = 2333 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2334 2335 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2336 return false; 2337 2338 if (!Unsafe && ResTy == S32 && 2339 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2340 return false; 2341 2342 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2343 // 1 / x -> RCP(x) 2344 if (CLHS->isExactlyValue(1.0)) { 2345 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2346 .addUse(RHS) 2347 .setMIFlags(Flags); 2348 2349 MI.eraseFromParent(); 2350 return true; 2351 } 2352 2353 // -1 / x -> RCP( FNEG(x) ) 2354 if (CLHS->isExactlyValue(-1.0)) { 2355 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2356 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2357 .addUse(FNeg.getReg(0)) 2358 .setMIFlags(Flags); 2359 2360 MI.eraseFromParent(); 2361 return true; 2362 } 2363 } 2364 2365 // x / y -> x * (1.0 / y) 2366 if (Unsafe) { 2367 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2368 .addUse(RHS) 2369 .setMIFlags(Flags); 2370 B.buildFMul(Res, LHS, RCP, Flags); 2371 2372 MI.eraseFromParent(); 2373 return true; 2374 } 2375 2376 return false; 2377 } 2378 2379 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2380 MachineRegisterInfo &MRI, 2381 
MachineIRBuilder &B) const { 2382 B.setInstr(MI); 2383 Register Res = MI.getOperand(0).getReg(); 2384 Register LHS = MI.getOperand(1).getReg(); 2385 Register RHS = MI.getOperand(2).getReg(); 2386 2387 uint16_t Flags = MI.getFlags(); 2388 2389 LLT S16 = LLT::scalar(16); 2390 LLT S32 = LLT::scalar(32); 2391 2392 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2393 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2394 2395 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2396 .addUse(RHSExt.getReg(0)) 2397 .setMIFlags(Flags); 2398 2399 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2400 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2401 2402 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2403 .addUse(RDst.getReg(0)) 2404 .addUse(RHS) 2405 .addUse(LHS) 2406 .setMIFlags(Flags); 2407 2408 MI.eraseFromParent(); 2409 return true; 2410 } 2411 2412 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2413 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2414 static void toggleSPDenormMode(bool Enable, 2415 MachineIRBuilder &B, 2416 const GCNSubtarget &ST, 2417 AMDGPU::SIModeRegisterDefaults Mode) { 2418 // Set SP denorm mode to this value. 2419 unsigned SPDenormMode = 2420 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2421 2422 if (ST.hasDenormModeInst()) { 2423 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2424 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2425 2426 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2427 B.buildInstr(AMDGPU::S_DENORM_MODE) 2428 .addImm(NewDenormModeValue); 2429 2430 } else { 2431 // Select FP32 bit field in mode register. 2432 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2433 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2434 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2435 2436 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2437 .addImm(SPDenormMode) 2438 .addImm(SPDenormModeBitField); 2439 } 2440 } 2441 2442 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2443 MachineRegisterInfo &MRI, 2444 MachineIRBuilder &B) const { 2445 B.setInstr(MI); 2446 Register Res = MI.getOperand(0).getReg(); 2447 Register LHS = MI.getOperand(1).getReg(); 2448 Register RHS = MI.getOperand(2).getReg(); 2449 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2450 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2451 2452 uint16_t Flags = MI.getFlags(); 2453 2454 LLT S32 = LLT::scalar(32); 2455 LLT S1 = LLT::scalar(1); 2456 2457 auto One = B.buildFConstant(S32, 1.0f); 2458 2459 auto DenominatorScaled = 2460 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2461 .addUse(RHS) 2462 .addUse(LHS) 2463 .addImm(1) 2464 .setMIFlags(Flags); 2465 auto NumeratorScaled = 2466 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2467 .addUse(LHS) 2468 .addUse(RHS) 2469 .addImm(0) 2470 .setMIFlags(Flags); 2471 2472 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2473 .addUse(DenominatorScaled.getReg(0)) 2474 .setMIFlags(Flags); 2475 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2476 2477 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2478 // aren't modeled as reading it. 
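// If FP32 denormals are not already enabled, turn them on around the
// reciprocal refinement sequence below and restore the previous mode
// afterwards.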
2479 if (!Mode.allFP32Denormals())
2480 toggleSPDenormMode(true, B, ST, Mode);
2481
2482 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2483 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2484 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2485 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2486 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2487 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2488
2489 if (!Mode.allFP32Denormals())
2490 toggleSPDenormMode(false, B, ST, Mode);
2491
2492 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2493 .addUse(Fma4.getReg(0))
2494 .addUse(Fma1.getReg(0))
2495 .addUse(Fma3.getReg(0))
2496 .addUse(NumeratorScaled.getReg(1))
2497 .setMIFlags(Flags);
2498
2499 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2500 .addUse(Fmas.getReg(0))
2501 .addUse(RHS)
2502 .addUse(LHS)
2503 .setMIFlags(Flags);
2504
2505 MI.eraseFromParent();
2506 return true;
2507 }
2508
2509 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2510 MachineRegisterInfo &MRI,
2511 MachineIRBuilder &B) const {
2512 B.setInstr(MI);
2513 Register Res = MI.getOperand(0).getReg();
2514 Register LHS = MI.getOperand(1).getReg();
2515 Register RHS = MI.getOperand(2).getReg();
2516
2517 uint16_t Flags = MI.getFlags();
2518
2519 LLT S64 = LLT::scalar(64);
2520 LLT S1 = LLT::scalar(1);
2521
2522 auto One = B.buildFConstant(S64, 1.0);
2523
2524 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2525 .addUse(LHS)
2526 .addUse(RHS)
2527 .addImm(1)
2528 .setMIFlags(Flags);
2529
2530 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2531
2532 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2533 .addUse(DivScale0.getReg(0))
2534 .setMIFlags(Flags);
2535
2536 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2537 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2538 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2539
2540 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2541 .addUse(LHS)
2542 .addUse(RHS)
2543 .addImm(0)
2544 .setMIFlags(Flags);
2545
2546 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2547 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2548 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2549
2550 Register Scale;
2551 if (!ST.hasUsableDivScaleConditionOutput()) {
2552 // Workaround a hardware bug on SI where the condition output from div_scale
2553 // is not usable.
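// Compare the high halves of the original operands against the high halves of
// the div_scale results, and xor the two comparisons to recover the scale
// condition manually.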
2554 2555 LLT S32 = LLT::scalar(32); 2556 2557 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2558 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2559 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2560 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2561 2562 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2563 Scale1Unmerge.getReg(1)); 2564 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2565 Scale0Unmerge.getReg(1)); 2566 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2567 } else { 2568 Scale = DivScale1.getReg(1); 2569 } 2570 2571 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2572 .addUse(Fma4.getReg(0)) 2573 .addUse(Fma3.getReg(0)) 2574 .addUse(Mul.getReg(0)) 2575 .addUse(Scale) 2576 .setMIFlags(Flags); 2577 2578 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2579 .addUse(Fmas.getReg(0)) 2580 .addUse(RHS) 2581 .addUse(LHS) 2582 .setMIFlags(Flags); 2583 2584 MI.eraseFromParent(); 2585 return true; 2586 } 2587 2588 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2589 MachineRegisterInfo &MRI, 2590 MachineIRBuilder &B) const { 2591 B.setInstr(MI); 2592 Register Res = MI.getOperand(0).getReg(); 2593 Register LHS = MI.getOperand(2).getReg(); 2594 Register RHS = MI.getOperand(3).getReg(); 2595 uint16_t Flags = MI.getFlags(); 2596 2597 LLT S32 = LLT::scalar(32); 2598 LLT S1 = LLT::scalar(1); 2599 2600 auto Abs = B.buildFAbs(S32, RHS, Flags); 2601 const APFloat C0Val(1.0f); 2602 2603 auto C0 = B.buildConstant(S32, 0x6f800000); 2604 auto C1 = B.buildConstant(S32, 0x2f800000); 2605 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2606 2607 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2608 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2609 2610 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2611 2612 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2613 .addUse(Mul0.getReg(0)) 2614 .setMIFlags(Flags); 2615 2616 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2617 2618 B.buildFMul(Res, Sel, Mul1, Flags); 2619 2620 MI.eraseFromParent(); 2621 return true; 2622 } 2623 2624 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2625 MachineRegisterInfo &MRI, 2626 MachineIRBuilder &B) const { 2627 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2628 if (!MFI->isEntryFunction()) { 2629 return legalizePreloadedArgIntrin(MI, MRI, B, 2630 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2631 } 2632 2633 B.setInstr(MI); 2634 2635 uint64_t Offset = 2636 ST.getTargetLowering()->getImplicitParameterOffset( 2637 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2638 Register DstReg = MI.getOperand(0).getReg(); 2639 LLT DstTy = MRI.getType(DstReg); 2640 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2641 2642 const ArgDescriptor *Arg; 2643 const TargetRegisterClass *RC; 2644 std::tie(Arg, RC) 2645 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2646 if (!Arg) 2647 return false; 2648 2649 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2650 if (!loadInputValue(KernargPtrReg, B, Arg)) 2651 return false; 2652 2653 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2654 MI.eraseFromParent(); 2655 return true; 2656 } 2657 2658 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2659 MachineRegisterInfo &MRI, 2660 MachineIRBuilder &B, 2661 unsigned AddrSpace) const { 2662 B.setInstr(MI); 2663 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2664 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2665 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2666 MI.eraseFromParent(); 2667 return true; 2668 } 2669 2670 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2671 // offset (the offset that is included in bounds checking and swizzling, to be 2672 // split between the instruction's voffset and immoffset fields) and soffset 2673 // (the offset that is excluded from bounds checking and swizzling, to go in 2674 // the instruction's soffset field). This function takes the first kind of 2675 // offset and figures out how to split it between voffset and immoffset. 2676 std::tuple<Register, unsigned, unsigned> 2677 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2678 Register OrigOffset) const { 2679 const unsigned MaxImm = 4095; 2680 Register BaseReg; 2681 unsigned TotalConstOffset; 2682 MachineInstr *OffsetDef; 2683 const LLT S32 = LLT::scalar(32); 2684 2685 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2686 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2687 2688 unsigned ImmOffset = TotalConstOffset; 2689 2690 // If the immediate value is too big for the immoffset field, put the value 2691 // and -4096 into the immoffset field so that the value that is copied/added 2692 // for the voffset field is a multiple of 4096, and it stands more chance 2693 // of being CSEd with the copy/add for another similar load/store. 2694 // However, do not do that rounding down to a multiple of 4096 if that is a 2695 // negative number, as it appears to be illegal to have a negative offset 2696 // in the vgpr, even if adding the immediate offset makes it positive. 2697 unsigned Overflow = ImmOffset & ~MaxImm; 2698 ImmOffset -= Overflow; 2699 if ((int32_t)Overflow < 0) { 2700 Overflow += ImmOffset; 2701 ImmOffset = 0; 2702 } 2703 2704 if (Overflow != 0) { 2705 if (!BaseReg) { 2706 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2707 } else { 2708 auto OverflowVal = B.buildConstant(S32, Overflow); 2709 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2710 } 2711 } 2712 2713 if (!BaseReg) 2714 BaseReg = B.buildConstant(S32, 0).getReg(0); 2715 2716 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2717 } 2718 2719 /// Handle register layout difference for f16 images for some subtargets. 2720 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2721 MachineRegisterInfo &MRI, 2722 Register Reg) const { 2723 if (!ST.hasUnpackedD16VMem()) 2724 return Reg; 2725 2726 const LLT S16 = LLT::scalar(16); 2727 const LLT S32 = LLT::scalar(32); 2728 LLT StoreVT = MRI.getType(Reg); 2729 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2730 2731 auto Unmerge = B.buildUnmerge(S16, Reg); 2732 2733 SmallVector<Register, 4> WideRegs; 2734 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2735 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2736 2737 int NumElts = StoreVT.getNumElements(); 2738 2739 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2740 } 2741 2742 Register AMDGPULegalizerInfo::fixStoreSourceType( 2743 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2744 MachineRegisterInfo *MRI = B.getMRI(); 2745 LLT Ty = MRI->getType(VData); 2746 2747 const LLT S16 = LLT::scalar(16); 2748 2749 // Fixup illegal register types for i8 stores. 
2750 if (Ty == LLT::scalar(8) || Ty == S16) { 2751 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2752 return AnyExt; 2753 } 2754 2755 if (Ty.isVector()) { 2756 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2757 if (IsFormat) 2758 return handleD16VData(B, *MRI, VData); 2759 } 2760 } 2761 2762 return VData; 2763 } 2764 2765 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2766 MachineRegisterInfo &MRI, 2767 MachineIRBuilder &B, 2768 bool IsTyped, 2769 bool IsFormat) const { 2770 B.setInstr(MI); 2771 2772 Register VData = MI.getOperand(1).getReg(); 2773 LLT Ty = MRI.getType(VData); 2774 LLT EltTy = Ty.getScalarType(); 2775 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2776 const LLT S32 = LLT::scalar(32); 2777 2778 VData = fixStoreSourceType(B, VData, IsFormat); 2779 Register RSrc = MI.getOperand(2).getReg(); 2780 2781 MachineMemOperand *MMO = *MI.memoperands_begin(); 2782 const int MemSize = MMO->getSize(); 2783 2784 unsigned ImmOffset; 2785 unsigned TotalOffset; 2786 2787 // The typed intrinsics add an immediate after the registers. 2788 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2789 2790 // The struct intrinsic variants add one additional operand over raw. 2791 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2792 Register VIndex; 2793 int OpOffset = 0; 2794 if (HasVIndex) { 2795 VIndex = MI.getOperand(3).getReg(); 2796 OpOffset = 1; 2797 } 2798 2799 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2800 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2801 2802 unsigned Format = 0; 2803 if (IsTyped) { 2804 Format = MI.getOperand(5 + OpOffset).getImm(); 2805 ++OpOffset; 2806 } 2807 2808 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2809 2810 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2811 if (TotalOffset != 0) 2812 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2813 2814 unsigned Opc; 2815 if (IsTyped) { 2816 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 2817 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 2818 } else if (IsFormat) { 2819 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 2820 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 2821 } else { 2822 switch (MemSize) { 2823 case 1: 2824 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 2825 break; 2826 case 2: 2827 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 2828 break; 2829 default: 2830 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 2831 break; 2832 } 2833 } 2834 2835 if (!VIndex) 2836 VIndex = B.buildConstant(S32, 0).getReg(0); 2837 2838 auto MIB = B.buildInstr(Opc) 2839 .addUse(VData) // vdata 2840 .addUse(RSrc) // rsrc 2841 .addUse(VIndex) // vindex 2842 .addUse(VOffset) // voffset 2843 .addUse(SOffset) // soffset 2844 .addImm(ImmOffset); // offset(imm) 2845 2846 if (IsTyped) 2847 MIB.addImm(Format); 2848 2849 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2850 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2851 .addMemOperand(MMO); 2852 2853 MI.eraseFromParent(); 2854 return true; 2855 } 2856 2857 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 2858 MachineRegisterInfo &MRI, 2859 MachineIRBuilder &B, 2860 bool IsFormat, 2861 bool IsTyped) const { 2862 B.setInstr(MI); 2863 2864 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
2865 MachineMemOperand *MMO = *MI.memoperands_begin(); 2866 const int MemSize = MMO->getSize(); 2867 const LLT S32 = LLT::scalar(32); 2868 2869 Register Dst = MI.getOperand(0).getReg(); 2870 Register RSrc = MI.getOperand(2).getReg(); 2871 2872 // The typed intrinsics add an immediate after the registers. 2873 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2874 2875 // The struct intrinsic variants add one additional operand over raw. 2876 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2877 Register VIndex; 2878 int OpOffset = 0; 2879 if (HasVIndex) { 2880 VIndex = MI.getOperand(3).getReg(); 2881 OpOffset = 1; 2882 } 2883 2884 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2885 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2886 2887 unsigned Format = 0; 2888 if (IsTyped) { 2889 Format = MI.getOperand(5 + OpOffset).getImm(); 2890 ++OpOffset; 2891 } 2892 2893 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2894 unsigned ImmOffset; 2895 unsigned TotalOffset; 2896 2897 LLT Ty = MRI.getType(Dst); 2898 LLT EltTy = Ty.getScalarType(); 2899 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2900 const bool Unpacked = ST.hasUnpackedD16VMem(); 2901 2902 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2903 if (TotalOffset != 0) 2904 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2905 2906 unsigned Opc; 2907 2908 if (IsTyped) { 2909 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 2910 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 2911 } else if (IsFormat) { 2912 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 2913 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 2914 } else { 2915 switch (MemSize) { 2916 case 1: 2917 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 2918 break; 2919 case 2: 2920 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 2921 break; 2922 default: 2923 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 2924 break; 2925 } 2926 } 2927 2928 Register LoadDstReg; 2929 2930 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 2931 LLT UnpackedTy = Ty.changeElementSize(32); 2932 2933 if (IsExtLoad) 2934 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 2935 else if (Unpacked && IsD16 && Ty.isVector()) 2936 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 2937 else 2938 LoadDstReg = Dst; 2939 2940 if (!VIndex) 2941 VIndex = B.buildConstant(S32, 0).getReg(0); 2942 2943 auto MIB = B.buildInstr(Opc) 2944 .addDef(LoadDstReg) // vdata 2945 .addUse(RSrc) // rsrc 2946 .addUse(VIndex) // vindex 2947 .addUse(VOffset) // voffset 2948 .addUse(SOffset) // soffset 2949 .addImm(ImmOffset); // offset(imm) 2950 2951 if (IsTyped) 2952 MIB.addImm(Format); 2953 2954 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2955 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2956 .addMemOperand(MMO); 2957 2958 if (LoadDstReg != Dst) { 2959 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2960 2961 // Widen result for extending loads was widened. 
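// Extending loads are truncated back down to the requested width; packed d16
// vector results are unmerged, truncated per element, and re-merged.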
2962 if (IsExtLoad) 2963 B.buildTrunc(Dst, LoadDstReg); 2964 else { 2965 // Repack to original 16-bit vector result 2966 // FIXME: G_TRUNC should work, but legalization currently fails 2967 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 2968 SmallVector<Register, 4> Repack; 2969 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 2970 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 2971 B.buildMerge(Dst, Repack); 2972 } 2973 } 2974 2975 MI.eraseFromParent(); 2976 return true; 2977 } 2978 2979 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 2980 MachineIRBuilder &B, 2981 bool IsInc) const { 2982 B.setInstr(MI); 2983 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 2984 AMDGPU::G_AMDGPU_ATOMIC_DEC; 2985 B.buildInstr(Opc) 2986 .addDef(MI.getOperand(0).getReg()) 2987 .addUse(MI.getOperand(2).getReg()) 2988 .addUse(MI.getOperand(3).getReg()) 2989 .cloneMemRefs(MI); 2990 MI.eraseFromParent(); 2991 return true; 2992 } 2993 2994 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 2995 switch (IntrID) { 2996 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 2997 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 2998 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 2999 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3000 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3001 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3002 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3003 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3004 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3005 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3006 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3007 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3008 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3009 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3010 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3011 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3012 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3013 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3014 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3015 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3016 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3017 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3018 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3019 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3020 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3021 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3022 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3023 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3024 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3025 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3026 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3027 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3028 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3029 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3030 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3031 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3032 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3033 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3034 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3035 default: 3036 llvm_unreachable("unhandled atomic opcode"); 3037 } 3038 } 3039 3040 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3041 MachineIRBuilder &B, 3042 Intrinsic::ID IID) const { 3043 B.setInstr(MI); 3044 3045 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3046 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3047 3048 Register Dst = MI.getOperand(0).getReg(); 3049 Register VData = 
MI.getOperand(2).getReg(); 3050 3051 Register CmpVal; 3052 int OpOffset = 0; 3053 3054 if (IsCmpSwap) { 3055 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3056 ++OpOffset; 3057 } 3058 3059 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3060 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3061 3062 // The struct intrinsic variants add one additional operand over raw. 3063 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3064 Register VIndex; 3065 if (HasVIndex) { 3066 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3067 ++OpOffset; 3068 } 3069 3070 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3071 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3072 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3073 3074 MachineMemOperand *MMO = *MI.memoperands_begin(); 3075 3076 unsigned ImmOffset; 3077 unsigned TotalOffset; 3078 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3079 if (TotalOffset != 0) 3080 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3081 3082 if (!VIndex) 3083 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3084 3085 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3086 .addDef(Dst) 3087 .addUse(VData); // vdata 3088 3089 if (IsCmpSwap) 3090 MIB.addReg(CmpVal); 3091 3092 MIB.addUse(RSrc) // rsrc 3093 .addUse(VIndex) // vindex 3094 .addUse(VOffset) // voffset 3095 .addUse(SOffset) // soffset 3096 .addImm(ImmOffset) // offset(imm) 3097 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3098 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3099 .addMemOperand(MMO); 3100 3101 MI.eraseFromParent(); 3102 return true; 3103 } 3104 3105 // Produce a vector of s16 elements from s32 pieces. 3106 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg, 3107 ArrayRef<Register> UnmergeParts) { 3108 const LLT S16 = LLT::scalar(16); 3109 3110 SmallVector<Register, 4> RemergeParts(UnmergeParts.size()); 3111 for (int I = 0, E = UnmergeParts.size(); I != E; ++I) 3112 RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0); 3113 3114 B.buildBuildVector(DstReg, RemergeParts); 3115 } 3116 3117 /// Convert a set of s32 registers to a result vector with s16 elements. 3118 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg, 3119 ArrayRef<Register> UnmergeParts) { 3120 MachineRegisterInfo &MRI = *B.getMRI(); 3121 const LLT V2S16 = LLT::vector(2, 16); 3122 LLT TargetTy = MRI.getType(DstReg); 3123 int NumElts = UnmergeParts.size(); 3124 3125 if (NumElts == 1) { 3126 assert(TargetTy == V2S16); 3127 B.buildBitcast(DstReg, UnmergeParts[0]); 3128 return; 3129 } 3130 3131 SmallVector<Register, 4> RemergeParts(NumElts); 3132 for (int I = 0; I != NumElts; ++I) 3133 RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0); 3134 3135 if (TargetTy.getSizeInBits() == 32u * NumElts) { 3136 B.buildConcatVectors(DstReg, RemergeParts); 3137 return; 3138 } 3139 3140 const LLT V3S16 = LLT::vector(3, 16); 3141 const LLT V6S16 = LLT::vector(6, 16); 3142 3143 // Widen to v6s16 and unpack v3 parts. 3144 assert(TargetTy == V3S16); 3145 3146 RemergeParts.push_back(B.buildUndef(V2S16).getReg(0)); 3147 auto Concat = B.buildConcatVectors(V6S16, RemergeParts); 3148 B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat); 3149 } 3150 3151 // FIXME: Just vector trunc should be sufficent, but legalization currently 3152 // broken. 
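// Unmerge the wide s32 result, truncate each piece back to s16, and rebuild
// the result vector.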
3153 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg, 3154 Register WideDstReg) { 3155 const LLT S32 = LLT::scalar(32); 3156 const LLT S16 = LLT::scalar(16); 3157 3158 auto Unmerge = B.buildUnmerge(S32, WideDstReg); 3159 3160 int NumOps = Unmerge->getNumOperands() - 1; 3161 SmallVector<Register, 4> RemergeParts(NumOps); 3162 for (int I = 0; I != NumOps; ++I) 3163 RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0); 3164 3165 B.buildBuildVector(DstReg, RemergeParts); 3166 } 3167 3168 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3169 MachineInstr &MI, MachineIRBuilder &B, 3170 GISelChangeObserver &Observer, 3171 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3172 bool IsTFE = MI.getNumExplicitDefs() == 2; 3173 3174 // We are only processing the operands of d16 image operations on subtargets 3175 // that use the unpacked register layout, or need to repack the TFE result. 3176 3177 // TODO: Need to handle a16 images too 3178 // TODO: Do we need to guard against already legalized intrinsics? 3179 if (!IsTFE && !ST.hasUnpackedD16VMem()) 3180 return true; 3181 3182 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3183 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3184 3185 if (BaseOpcode->Atomic) // No d16 atomics, or TFE. 3186 return true; 3187 3188 B.setInstr(MI); 3189 3190 MachineRegisterInfo *MRI = B.getMRI(); 3191 const LLT S32 = LLT::scalar(32); 3192 const LLT S16 = LLT::scalar(16); 3193 3194 if (BaseOpcode->Store) { // No TFE for stores? 3195 Register VData = MI.getOperand(1).getReg(); 3196 LLT Ty = MRI->getType(VData); 3197 if (!Ty.isVector() || Ty.getElementType() != S16) 3198 return true; 3199 3200 B.setInstr(MI); 3201 3202 Observer.changingInstr(MI); 3203 MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData)); 3204 Observer.changedInstr(MI); 3205 return true; 3206 } 3207 3208 Register DstReg = MI.getOperand(0).getReg(); 3209 LLT Ty = MRI->getType(DstReg); 3210 const LLT EltTy = Ty.getScalarType(); 3211 const bool IsD16 = Ty.getScalarType() == S16; 3212 const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3213 3214 if (IsTFE) { 3215 // In the IR, TFE is supposed to be used with a 2 element struct return 3216 // type. The intruction really returns these two values in one contiguous 3217 // register, with one additional dword beyond the loaded data. Rewrite the 3218 // return type to use a single register result. 3219 Register Dst1Reg = MI.getOperand(1).getReg(); 3220 if (MRI->getType(Dst1Reg) != S32) 3221 return false; 3222 3223 // TODO: Make sure the TFE operand bit is set. 3224 3225 // The raw dword aligned data component of the load. The only legal cases 3226 // where this matters should be when using the packed D16 format, for 3227 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3228 LLT RoundedTy; 3229 LLT TFETy; 3230 3231 if (IsD16 && ST.hasUnpackedD16VMem()) { 3232 RoundedTy = LLT::scalarOrVector(NumElts, 32); 3233 TFETy = LLT::vector(NumElts + 1, 32); 3234 } else { 3235 unsigned EltSize = Ty.getScalarSizeInBits(); 3236 unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32; 3237 unsigned RoundedSize = 32 * RoundedElts; 3238 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3239 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3240 } 3241 3242 Register TFEReg = MRI->createGenericVirtualRegister(TFETy); 3243 Observer.changingInstr(MI); 3244 3245 MI.getOperand(0).setReg(TFEReg); 3246 MI.RemoveOperand(1); 3247 3248 Observer.changedInstr(MI); 3249 3250 // Insert after the instruction. 
3251 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3252 3253 // Now figure out how to copy the new result register back into the old 3254 // result. 3255 3256 SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg); 3257 int NumDataElts = TFETy.getNumElements() - 1; 3258 3259 if (!Ty.isVector()) { 3260 // Simplest case is a trivial unmerge (plus a truncate for d16). 3261 UnmergeResults[0] = Ty == S32 ? 3262 DstReg : MRI->createGenericVirtualRegister(S32); 3263 3264 B.buildUnmerge(UnmergeResults, TFEReg); 3265 if (Ty != S32) 3266 B.buildTrunc(DstReg, UnmergeResults[0]); 3267 return true; 3268 } 3269 3270 // We have to repack into a new vector of some kind. 3271 for (int I = 0; I != NumDataElts; ++I) 3272 UnmergeResults[I] = MRI->createGenericVirtualRegister(S32); 3273 B.buildUnmerge(UnmergeResults, TFEReg); 3274 3275 // Drop the final TFE element. 3276 ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts); 3277 3278 if (EltTy == S32) 3279 B.buildBuildVector(DstReg, DataPart); 3280 else if (ST.hasUnpackedD16VMem()) 3281 truncToS16Vector(B, DstReg, DataPart); 3282 else 3283 bitcastToS16Vector(B, DstReg, DataPart); 3284 3285 return true; 3286 } 3287 3288 // Must be an image load. 3289 if (!Ty.isVector() || Ty.getElementType() != S16) 3290 return true; 3291 3292 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3293 3294 LLT WidenedTy = Ty.changeElementType(S32); 3295 Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy); 3296 3297 Observer.changingInstr(MI); 3298 MI.getOperand(0).setReg(WideDstReg); 3299 Observer.changedInstr(MI); 3300 3301 repackUnpackedD16Load(B, DstReg, WideDstReg); 3302 return true; 3303 } 3304 3305 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 3306 MachineInstr &MI, MachineIRBuilder &B, 3307 GISelChangeObserver &Observer) const { 3308 Register Dst = MI.getOperand(0).getReg(); 3309 LLT Ty = B.getMRI()->getType(Dst); 3310 unsigned Size = Ty.getSizeInBits(); 3311 MachineFunction &MF = B.getMF(); 3312 3313 Observer.changingInstr(MI); 3314 3315 // FIXME: We don't really need this intermediate instruction. The intrinsic 3316 // should be fixed to have a memory operand. Since it's readnone, we're not 3317 // allowed to add one. 3318 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 3319 MI.RemoveOperand(1); // Remove intrinsic ID 3320 3321 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 3322 // TODO: Should this use datalayout alignment? 3323 const unsigned MemSize = (Size + 7) / 8; 3324 const unsigned MemAlign = 4; 3325 MachineMemOperand *MMO = MF.getMachineMemOperand( 3326 MachinePointerInfo(), 3327 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 3328 MachineMemOperand::MOInvariant, MemSize, MemAlign); 3329 MI.addMemOperand(MF, MMO); 3330 3331 // There are no 96-bit result scalar loads, but widening to 128-bit should 3332 // always be legal. We may need to restore this to a 96-bit result if it turns 3333 // out this needs to be converted to a vector load during RegBankSelect. 
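// Round a non-power-of-two result up to the next power of two: pad vector
// results with extra elements, or widen the scalar result.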
3334 if (!isPowerOf2_32(Size)) { 3335 LegalizerHelper Helper(MF, *this, Observer, B); 3336 B.setInstr(MI); 3337 3338 if (Ty.isVector()) 3339 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 3340 else 3341 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 3342 } 3343 3344 Observer.changedInstr(MI); 3345 return true; 3346 } 3347 3348 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 3349 MachineIRBuilder &B, 3350 GISelChangeObserver &Observer) const { 3351 MachineRegisterInfo &MRI = *B.getMRI(); 3352 3353 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 3354 auto IntrID = MI.getIntrinsicID(); 3355 switch (IntrID) { 3356 case Intrinsic::amdgcn_if: 3357 case Intrinsic::amdgcn_else: { 3358 MachineInstr *Br = nullptr; 3359 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3360 const SIRegisterInfo *TRI 3361 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3362 3363 B.setInstr(*BrCond); 3364 Register Def = MI.getOperand(1).getReg(); 3365 Register Use = MI.getOperand(3).getReg(); 3366 3367 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 3368 if (Br) 3369 BrTarget = Br->getOperand(0).getMBB(); 3370 3371 if (IntrID == Intrinsic::amdgcn_if) { 3372 B.buildInstr(AMDGPU::SI_IF) 3373 .addDef(Def) 3374 .addUse(Use) 3375 .addMBB(BrTarget); 3376 } else { 3377 B.buildInstr(AMDGPU::SI_ELSE) 3378 .addDef(Def) 3379 .addUse(Use) 3380 .addMBB(BrTarget) 3381 .addImm(0); 3382 } 3383 3384 if (Br) 3385 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 3386 3387 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 3388 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 3389 MI.eraseFromParent(); 3390 BrCond->eraseFromParent(); 3391 return true; 3392 } 3393 3394 return false; 3395 } 3396 case Intrinsic::amdgcn_loop: { 3397 MachineInstr *Br = nullptr; 3398 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3399 const SIRegisterInfo *TRI 3400 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3401 3402 B.setInstr(*BrCond); 3403 3404 // FIXME: Need to adjust branch targets based on unconditional branch. 
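// Rewrite the G_BRCOND use into SI_LOOP on the mask register, branching to the
// block the original conditional branch targeted.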
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
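  // All raw and struct buffer atomics share one legalization path; the
  // intrinsic ID is passed along so legalizeBufferAtomic can distinguish the
  // specific operation.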
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}