//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}
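
// Break a wide vector into pieces of at most 64 bits each, e.g. v4s32 becomes
// v2s32. The result may be a scalar if only a single element remains.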
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
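// For example, s96, v3s32 and v4s16 qualify, while v3s16 and v3s8 do not.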
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
    .scalarize(0)
    .alwaysLegal();

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return IsLoad ? 512 : 128;
    default:
      return 128;
    }
  };

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(AddrSpace, true))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
          [=](const LegalityQuery &Query) -> bool {
            return !Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            const unsigned DstSize = DstTy.getSizeInBits();
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;

            // Split extloads.
            if (DstSize > MemSize)
              return std::make_pair(0, LLT::scalar(MemSize));

            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(0, LLT::scalar(FloorSize));
            }

            if (DstSize > 32 && (DstSize % 32 != 0)) {
              // FIXME: Need a way to specify non-extload of larger size if
              // suitably aligned.
              return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
            }

            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);
            if (MemSize > MaxSize)
              return std::make_pair(0, LLT::scalar(MaxSize));

            unsigned Align = Query.MMODescrs[0].AlignInBits;
            return std::make_pair(0, LLT::scalar(Align));
          })
        .fewerElementsIf(
          [=](const LegalityQuery &Query) -> bool {
            return Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            LLT EltTy = DstTy.getElementType();
            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);

            // FIXME: Handle widened to power of 2 results better. This ends
            // up scalarizing.
            // FIXME: 3 element stores scalarized on SI

            // Split if it's too large for the address space.
            if (Query.MMODescrs[0].SizeInBits > MaxSize) {
              unsigned NumElts = DstTy.getNumElements();
              unsigned EltSize = EltTy.getSizeInBits();

              if (MaxSize % EltSize == 0) {
                return std::make_pair(
                  0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
              }

              unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

              // FIXME: Refine when odd breakdowns handled
              // The scalars will need to be re-legalized.
              if (NumPieces == 1 || NumPieces >= NumElts ||
                  NumElts % NumPieces != 0)
                return std::make_pair(0, EltTy);

              return std::make_pair(0,
                                    LLT::vector(NumElts / NumPieces, EltTy));
            }

            // FIXME: We could probably handle weird extending loads better.
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;
            if (DstTy.getSizeInBits() > MemSize)
              return std::make_pair(0, EltTy);

            unsigned EltSize = EltTy.getSizeInBits();
            unsigned DstSize = DstTy.getSizeInBits();
            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(
                0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
            }

            // Need to split because of alignment.
            unsigned Align = Query.MMODescrs[0].AlignInBits;
            if (EltSize > Align &&
                (EltSize / Align < DstTy.getNumElements())) {
              return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
            }

            // May need relegalization for the scalars.
            return std::make_pair(0, EltTy);
          })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    // TODO: Support 16-bit shift amounts
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)

      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .lowerFor({{S16, V2S16}})
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
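
  // Casts between flat and the local/private segments either truncate to the
  // low 32 bits (flat -> segment) or prepend the 32-bit segment aperture as
  // the high half (segment -> flat); null values are mapped explicitly below.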
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
    B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
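  // Adding and then subtracting 2^52 (with the sign of the input copied onto
  // the constant) forces the fractional bits to be rounded away in double
  // precision, producing a round-to-nearest-integer result. Inputs with a
  // magnitude of at least 2^52 are already integers and are returned unchanged
  // by the compare/select below.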
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
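  // SignBit64 is the input with the exponent and mantissa cleared (a signed
  // zero); the AShr/Not/And sequence below then masks off the fraction bits
  // that lie under the exponent, truncating toward zero, and the selects
  // handle exponents that are negative or larger than the fraction width.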
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));

  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);

  auto Hi = Signed ?
1685 B.buildFPTOSI(S32, FloorMul) : 1686 B.buildFPTOUI(S32, FloorMul); 1687 auto Lo = B.buildFPTOUI(S32, Fma); 1688 1689 B.buildMerge(Dst, { Lo, Hi }); 1690 MI.eraseFromParent(); 1691 1692 return true; 1693 } 1694 1695 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1696 MachineInstr &MI, MachineRegisterInfo &MRI, 1697 MachineIRBuilder &B) const { 1698 MachineFunction &MF = B.getMF(); 1699 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1700 1701 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1702 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1703 1704 // With ieee_mode disabled, the instructions have the correct behavior 1705 // already for G_FMINNUM/G_FMAXNUM 1706 if (!MFI->getMode().IEEE) 1707 return !IsIEEEOp; 1708 1709 if (IsIEEEOp) 1710 return true; 1711 1712 MachineIRBuilder HelperBuilder(MI); 1713 GISelObserverWrapper DummyObserver; 1714 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1715 HelperBuilder.setInstr(MI); 1716 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1717 } 1718 1719 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1720 MachineInstr &MI, MachineRegisterInfo &MRI, 1721 MachineIRBuilder &B) const { 1722 // TODO: Should move some of this into LegalizerHelper. 1723 1724 // TODO: Promote dynamic indexing of s16 to s32 1725 1726 // FIXME: Artifact combiner probably should have replaced the truncated 1727 // constant before this, so we shouldn't need 1728 // getConstantVRegValWithLookThrough. 1729 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1730 MI.getOperand(2).getReg(), MRI); 1731 if (!IdxVal) // Dynamic case will be selected to register indexing. 1732 return true; 1733 1734 Register Dst = MI.getOperand(0).getReg(); 1735 Register Vec = MI.getOperand(1).getReg(); 1736 1737 LLT VecTy = MRI.getType(Vec); 1738 LLT EltTy = VecTy.getElementType(); 1739 assert(EltTy == MRI.getType(Dst)); 1740 1741 B.setInstr(MI); 1742 1743 if (IdxVal->Value < VecTy.getNumElements()) 1744 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1745 else 1746 B.buildUndef(Dst); 1747 1748 MI.eraseFromParent(); 1749 return true; 1750 } 1751 1752 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1753 MachineInstr &MI, MachineRegisterInfo &MRI, 1754 MachineIRBuilder &B) const { 1755 // TODO: Should move some of this into LegalizerHelper. 1756 1757 // TODO: Promote dynamic indexing of s16 to s32 1758 1759 // FIXME: Artifact combiner probably should have replaced the truncated 1760 // constant before this, so we shouldn't need 1761 // getConstantVRegValWithLookThrough. 1762 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1763 MI.getOperand(3).getReg(), MRI); 1764 if (!IdxVal) // Dynamic case will be selected to register indexing. 1765 return true; 1766 1767 Register Dst = MI.getOperand(0).getReg(); 1768 Register Vec = MI.getOperand(1).getReg(); 1769 Register Ins = MI.getOperand(2).getReg(); 1770 1771 LLT VecTy = MRI.getType(Vec); 1772 LLT EltTy = VecTy.getElementType(); 1773 assert(EltTy == MRI.getType(Ins)); 1774 1775 B.setInstr(MI); 1776 1777 if (IdxVal->Value < VecTy.getNumElements()) 1778 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1779 else 1780 B.buildUndef(Dst); 1781 1782 MI.eraseFromParent(); 1783 return true; 1784 } 1785 1786 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) { 1787 assert(Mask.size() == 2); 1788 1789 // If one half is undef, the other is trivially in the same reg. 
1790 if (Mask[0] == -1 || Mask[1] == -1) 1791 return true; 1792 return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) || 1793 ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3)); 1794 } 1795 1796 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1797 MachineInstr &MI, MachineRegisterInfo &MRI, 1798 MachineIRBuilder &B) const { 1799 const LLT V2S16 = LLT::vector(2, 16); 1800 1801 Register Dst = MI.getOperand(0).getReg(); 1802 Register Src0 = MI.getOperand(1).getReg(); 1803 LLT DstTy = MRI.getType(Dst); 1804 LLT SrcTy = MRI.getType(Src0); 1805 1806 if (SrcTy == V2S16 && DstTy == V2S16 && 1807 isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1808 return true; 1809 1810 MachineIRBuilder HelperBuilder(MI); 1811 GISelObserverWrapper DummyObserver; 1812 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1813 HelperBuilder.setInstr(MI); 1814 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1815 } 1816 1817 bool AMDGPULegalizerInfo::legalizeSinCos( 1818 MachineInstr &MI, MachineRegisterInfo &MRI, 1819 MachineIRBuilder &B) const { 1820 B.setInstr(MI); 1821 1822 Register DstReg = MI.getOperand(0).getReg(); 1823 Register SrcReg = MI.getOperand(1).getReg(); 1824 LLT Ty = MRI.getType(DstReg); 1825 unsigned Flags = MI.getFlags(); 1826 1827 Register TrigVal; 1828 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1829 if (ST.hasTrigReducedRange()) { 1830 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1831 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1832 .addUse(MulVal.getReg(0)) 1833 .setMIFlags(Flags).getReg(0); 1834 } else 1835 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1836 1837 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1838 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1839 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1840 .addUse(TrigVal) 1841 .setMIFlags(Flags); 1842 MI.eraseFromParent(); 1843 return true; 1844 } 1845 1846 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1847 Register DstReg, LLT PtrTy, 1848 MachineIRBuilder &B, const GlobalValue *GV, 1849 unsigned Offset, unsigned GAFlags) const { 1850 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1851 // to the following code sequence: 1852 // 1853 // For constant address space: 1854 // s_getpc_b64 s[0:1] 1855 // s_add_u32 s0, s0, $symbol 1856 // s_addc_u32 s1, s1, 0 1857 // 1858 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1859 // a fixup or relocation is emitted to replace $symbol with a literal 1860 // constant, which is a pc-relative offset from the encoding of the $symbol 1861 // operand to the global variable. 1862 // 1863 // For global address space: 1864 // s_getpc_b64 s[0:1] 1865 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1866 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1867 // 1868 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1869 // fixups or relocations are emitted to replace $symbol@*@lo and 1870 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1871 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1872 // operand to the global variable. 
1873 // 1874 // What we want here is an offset from the value returned by s_getpc 1875 // (which is the address of the s_add_u32 instruction) to the global 1876 // variable, but since the encoding of $symbol starts 4 bytes after the start 1877 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1878 // small. This requires us to add 4 to the global variable offset in order to 1879 // compute the correct address. 1880 1881 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1882 1883 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1884 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1885 1886 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1887 .addDef(PCReg); 1888 1889 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1890 if (GAFlags == SIInstrInfo::MO_NONE) 1891 MIB.addImm(0); 1892 else 1893 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1894 1895 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1896 1897 if (PtrTy.getSizeInBits() == 32) 1898 B.buildExtract(DstReg, PCReg, 0); 1899 return true; 1900 } 1901 1902 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1903 MachineInstr &MI, MachineRegisterInfo &MRI, 1904 MachineIRBuilder &B) const { 1905 Register DstReg = MI.getOperand(0).getReg(); 1906 LLT Ty = MRI.getType(DstReg); 1907 unsigned AS = Ty.getAddressSpace(); 1908 1909 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1910 MachineFunction &MF = B.getMF(); 1911 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1912 B.setInstr(MI); 1913 1914 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1915 if (!MFI->isEntryFunction()) { 1916 const Function &Fn = MF.getFunction(); 1917 DiagnosticInfoUnsupported BadLDSDecl( 1918 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1919 Fn.getContext().diagnose(BadLDSDecl); 1920 } 1921 1922 // TODO: We could emit code to handle the initialization somewhere. 1923 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1924 const SITargetLowering *TLI = ST.getTargetLowering(); 1925 if (!TLI->shouldUseLDSConstAddress(GV)) { 1926 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 1927 return true; // Leave in place; 1928 } 1929 1930 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 1931 MI.eraseFromParent(); 1932 return true; 1933 } 1934 1935 const Function &Fn = MF.getFunction(); 1936 DiagnosticInfoUnsupported BadInit( 1937 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 1938 Fn.getContext().diagnose(BadInit); 1939 return true; 1940 } 1941 1942 const SITargetLowering *TLI = ST.getTargetLowering(); 1943 1944 if (TLI->shouldEmitFixup(GV)) { 1945 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 1946 MI.eraseFromParent(); 1947 return true; 1948 } 1949 1950 if (TLI->shouldEmitPCReloc(GV)) { 1951 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 1952 MI.eraseFromParent(); 1953 return true; 1954 } 1955 1956 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1957 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 1958 1959 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 1960 MachinePointerInfo::getGOT(MF), 1961 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1962 MachineMemOperand::MOInvariant, 1963 8 /*Size*/, 8 /*Align*/); 1964 1965 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 1966 1967 if (Ty.getSizeInBits() == 32) { 1968 // Truncate if this is a 32-bit constant adrdess. 
1969 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 1970 B.buildExtract(DstReg, Load, 0); 1971 } else 1972 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 1973 1974 MI.eraseFromParent(); 1975 return true; 1976 } 1977 1978 bool AMDGPULegalizerInfo::legalizeLoad( 1979 MachineInstr &MI, MachineRegisterInfo &MRI, 1980 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 1981 B.setInstr(MI); 1982 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1983 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 1984 Observer.changingInstr(MI); 1985 MI.getOperand(1).setReg(Cast.getReg(0)); 1986 Observer.changedInstr(MI); 1987 return true; 1988 } 1989 1990 bool AMDGPULegalizerInfo::legalizeFMad( 1991 MachineInstr &MI, MachineRegisterInfo &MRI, 1992 MachineIRBuilder &B) const { 1993 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 1994 assert(Ty.isScalar()); 1995 1996 MachineFunction &MF = B.getMF(); 1997 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1998 1999 // TODO: Always legal with future ftz flag. 2000 // FIXME: Do we need just output? 2001 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2002 return true; 2003 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2004 return true; 2005 2006 MachineIRBuilder HelperBuilder(MI); 2007 GISelObserverWrapper DummyObserver; 2008 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2009 HelperBuilder.setMBB(*MI.getParent()); 2010 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2011 } 2012 2013 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2014 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2015 Register DstReg = MI.getOperand(0).getReg(); 2016 Register PtrReg = MI.getOperand(1).getReg(); 2017 Register CmpVal = MI.getOperand(2).getReg(); 2018 Register NewVal = MI.getOperand(3).getReg(); 2019 2020 assert(SITargetLowering::isFlatGlobalAddrSpace( 2021 MRI.getType(PtrReg).getAddressSpace()) && 2022 "this should not have been custom lowered"); 2023 2024 LLT ValTy = MRI.getType(CmpVal); 2025 LLT VecTy = LLT::vector(2, ValTy); 2026 2027 B.setInstr(MI); 2028 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2029 2030 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2031 .addDef(DstReg) 2032 .addUse(PtrReg) 2033 .addUse(PackedVal) 2034 .setMemRefs(MI.memoperands()); 2035 2036 MI.eraseFromParent(); 2037 return true; 2038 } 2039 2040 bool AMDGPULegalizerInfo::legalizeFlog( 2041 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2042 Register Dst = MI.getOperand(0).getReg(); 2043 Register Src = MI.getOperand(1).getReg(); 2044 LLT Ty = B.getMRI()->getType(Dst); 2045 unsigned Flags = MI.getFlags(); 2046 B.setInstr(MI); 2047 2048 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2049 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2050 2051 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2052 MI.eraseFromParent(); 2053 return true; 2054 } 2055 2056 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2057 MachineIRBuilder &B) const { 2058 Register Dst = MI.getOperand(0).getReg(); 2059 Register Src = MI.getOperand(1).getReg(); 2060 unsigned Flags = MI.getFlags(); 2061 LLT Ty = B.getMRI()->getType(Dst); 2062 B.setInstr(MI); 2063 2064 auto K = B.buildFConstant(Ty, numbers::log2e); 2065 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2066 B.buildFExp2(Dst, Mul, Flags); 2067 MI.eraseFromParent(); 2068 return true; 2069 } 2070 2071 // Find a source register, ignoring 
any possible source modifiers. 2072 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2073 Register ModSrc = OrigSrc; 2074 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2075 ModSrc = SrcFNeg->getOperand(1).getReg(); 2076 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2077 ModSrc = SrcFAbs->getOperand(1).getReg(); 2078 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2079 ModSrc = SrcFAbs->getOperand(1).getReg(); 2080 return ModSrc; 2081 } 2082 2083 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2084 MachineRegisterInfo &MRI, 2085 MachineIRBuilder &B) const { 2086 B.setInstr(MI); 2087 2088 const LLT S1 = LLT::scalar(1); 2089 const LLT S64 = LLT::scalar(64); 2090 Register Dst = MI.getOperand(0).getReg(); 2091 Register OrigSrc = MI.getOperand(1).getReg(); 2092 unsigned Flags = MI.getFlags(); 2093 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2094 "this should not have been custom lowered"); 2095 2096 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2097 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2098 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2099 // V_FRACT bug is: 2100 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2101 // 2102 // Convert floor(x) to (x - fract(x)) 2103 2104 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2105 .addUse(OrigSrc) 2106 .setMIFlags(Flags); 2107 2108 // Give source modifier matching some assistance before obscuring a foldable 2109 // pattern. 2110 2111 // TODO: We can avoid the neg on the fract? The input sign to fract 2112 // shouldn't matter? 2113 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2114 2115 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2116 2117 Register Min = MRI.createGenericVirtualRegister(S64); 2118 2119 // We don't need to concern ourselves with the snan handling difference, so 2120 // use the one which will directly select. 2121 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2122 if (MFI->getMode().IEEE) 2123 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2124 else 2125 B.buildFMinNum(Min, Fract, Const, Flags); 2126 2127 Register CorrectedFract = Min; 2128 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2129 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2130 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2131 } 2132 2133 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2134 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2135 2136 MI.eraseFromParent(); 2137 return true; 2138 } 2139 2140 // Turn an illegal packed v2s16 build vector into bit operations. 2141 // TODO: This should probably be a bitcast action in LegalizerHelper. 
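// Illustrative example of the lowering performed below (generic MIR sketch;
// virtual register names are arbitrary):
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// becomes
//   %m:_(s32)       = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)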
2142 bool AMDGPULegalizerInfo::legalizeBuildVector( 2143 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2144 Register Dst = MI.getOperand(0).getReg(); 2145 LLT DstTy = MRI.getType(Dst); 2146 const LLT S32 = LLT::scalar(32); 2147 const LLT V2S16 = LLT::vector(2, 16); 2148 (void)DstTy; 2149 (void)V2S16; 2150 assert(DstTy == V2S16); 2151 2152 Register Src0 = MI.getOperand(1).getReg(); 2153 Register Src1 = MI.getOperand(2).getReg(); 2154 assert(MRI.getType(Src0) == LLT::scalar(16)); 2155 2156 B.setInstr(MI); 2157 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2158 B.buildBitcast(Dst, Merge); 2159 2160 MI.eraseFromParent(); 2161 return true; 2162 } 2163 2164 // Return the use branch instruction, otherwise null if the usage is invalid. 2165 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2166 MachineRegisterInfo &MRI, 2167 MachineInstr *&Br) { 2168 Register CondDef = MI.getOperand(0).getReg(); 2169 if (!MRI.hasOneNonDBGUse(CondDef)) 2170 return nullptr; 2171 2172 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2173 if (UseMI.getParent() != MI.getParent() || 2174 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2175 return nullptr; 2176 2177 // Make sure the cond br is followed by a G_BR 2178 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2179 if (Next != MI.getParent()->end()) { 2180 if (Next->getOpcode() != AMDGPU::G_BR) 2181 return nullptr; 2182 Br = &*Next; 2183 } 2184 2185 return &UseMI; 2186 } 2187 2188 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, 2189 Register Reg, LLT Ty) const { 2190 Register LiveIn = MRI.getLiveInVirtReg(Reg); 2191 if (LiveIn) 2192 return LiveIn; 2193 2194 Register NewReg = MRI.createGenericVirtualRegister(Ty); 2195 MRI.addLiveIn(Reg, NewReg); 2196 return NewReg; 2197 } 2198 2199 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2200 const ArgDescriptor *Arg) const { 2201 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2202 return false; // TODO: Handle these 2203 2204 assert(Arg->getRegister().isPhysical()); 2205 2206 MachineRegisterInfo &MRI = *B.getMRI(); 2207 2208 LLT Ty = MRI.getType(DstReg); 2209 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); 2210 2211 if (Arg->isMasked()) { 2212 // TODO: Should we try to emit this once in the entry block? 2213 const LLT S32 = LLT::scalar(32); 2214 const unsigned Mask = Arg->getMask(); 2215 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2216 2217 Register AndMaskSrc = LiveIn; 2218 2219 if (Shift != 0) { 2220 auto ShiftAmt = B.buildConstant(S32, Shift); 2221 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2222 } 2223 2224 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2225 } else 2226 B.buildCopy(DstReg, LiveIn); 2227 2228 // Insert the argument copy if it doens't already exist. 2229 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
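// Rough sketch of the MIR produced the first time a preloaded argument
// register is read (illustrative only; the physical register and virtual
// register names are arbitrary, and the two copies may end up in different
// blocks):
//   bb.0 (entry):
//     liveins: $sgpr8_sgpr9
//     %livein:_(p4) = COPY $sgpr8_sgpr9   ; inserted by the block below
//   ...
//     %dst:_(p4) = COPY %livein           ; built by buildCopy above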
2230 if (!MRI.getVRegDef(LiveIn)) { 2231 // FIXME: Should have scoped insert pt 2232 MachineBasicBlock &OrigInsBB = B.getMBB(); 2233 auto OrigInsPt = B.getInsertPt(); 2234 2235 MachineBasicBlock &EntryMBB = B.getMF().front(); 2236 EntryMBB.addLiveIn(Arg->getRegister()); 2237 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2238 B.buildCopy(LiveIn, Arg->getRegister()); 2239 2240 B.setInsertPt(OrigInsBB, OrigInsPt); 2241 } 2242 2243 return true; 2244 } 2245 2246 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2247 MachineInstr &MI, 2248 MachineRegisterInfo &MRI, 2249 MachineIRBuilder &B, 2250 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2251 B.setInstr(MI); 2252 2253 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2254 2255 const ArgDescriptor *Arg; 2256 const TargetRegisterClass *RC; 2257 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2258 if (!Arg) { 2259 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2260 return false; 2261 } 2262 2263 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 2264 MI.eraseFromParent(); 2265 return true; 2266 } 2267 2268 return false; 2269 } 2270 2271 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2272 MachineRegisterInfo &MRI, 2273 MachineIRBuilder &B) const { 2274 B.setInstr(MI); 2275 Register Dst = MI.getOperand(0).getReg(); 2276 LLT DstTy = MRI.getType(Dst); 2277 LLT S16 = LLT::scalar(16); 2278 LLT S32 = LLT::scalar(32); 2279 LLT S64 = LLT::scalar(64); 2280 2281 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2282 return true; 2283 2284 if (DstTy == S16) 2285 return legalizeFDIV16(MI, MRI, B); 2286 if (DstTy == S32) 2287 return legalizeFDIV32(MI, MRI, B); 2288 if (DstTy == S64) 2289 return legalizeFDIV64(MI, MRI, B); 2290 2291 return false; 2292 } 2293 2294 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2295 MachineRegisterInfo &MRI, 2296 MachineIRBuilder &B) const { 2297 Register Res = MI.getOperand(0).getReg(); 2298 Register LHS = MI.getOperand(1).getReg(); 2299 Register RHS = MI.getOperand(2).getReg(); 2300 2301 uint16_t Flags = MI.getFlags(); 2302 2303 LLT ResTy = MRI.getType(Res); 2304 LLT S32 = LLT::scalar(32); 2305 LLT S64 = LLT::scalar(64); 2306 2307 const MachineFunction &MF = B.getMF(); 2308 bool Unsafe = 2309 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2310 2311 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2312 return false; 2313 2314 if (!Unsafe && ResTy == S32 && 2315 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2316 return false; 2317 2318 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2319 // 1 / x -> RCP(x) 2320 if (CLHS->isExactlyValue(1.0)) { 2321 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2322 .addUse(RHS) 2323 .setMIFlags(Flags); 2324 2325 MI.eraseFromParent(); 2326 return true; 2327 } 2328 2329 // -1 / x -> RCP( FNEG(x) ) 2330 if (CLHS->isExactlyValue(-1.0)) { 2331 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2332 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2333 .addUse(FNeg.getReg(0)) 2334 .setMIFlags(Flags); 2335 2336 MI.eraseFromParent(); 2337 return true; 2338 } 2339 } 2340 2341 // x / y -> x * (1.0 / y) 2342 if (Unsafe) { 2343 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2344 .addUse(RHS) 2345 .setMIFlags(Flags); 2346 B.buildFMul(Res, LHS, RCP, Flags); 2347 2348 MI.eraseFromParent(); 2349 return true; 2350 } 2351 2352 return false; 2353 } 2354 2355 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2356 MachineRegisterInfo &MRI, 2357 
MachineIRBuilder &B) const { 2358 B.setInstr(MI); 2359 Register Res = MI.getOperand(0).getReg(); 2360 Register LHS = MI.getOperand(1).getReg(); 2361 Register RHS = MI.getOperand(2).getReg(); 2362 2363 uint16_t Flags = MI.getFlags(); 2364 2365 LLT S16 = LLT::scalar(16); 2366 LLT S32 = LLT::scalar(32); 2367 2368 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2369 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2370 2371 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2372 .addUse(RHSExt.getReg(0)) 2373 .setMIFlags(Flags); 2374 2375 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2376 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2377 2378 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2379 .addUse(RDst.getReg(0)) 2380 .addUse(RHS) 2381 .addUse(LHS) 2382 .setMIFlags(Flags); 2383 2384 MI.eraseFromParent(); 2385 return true; 2386 } 2387 2388 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2389 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2390 static void toggleSPDenormMode(bool Enable, 2391 MachineIRBuilder &B, 2392 const GCNSubtarget &ST, 2393 AMDGPU::SIModeRegisterDefaults Mode) { 2394 // Set SP denorm mode to this value. 2395 unsigned SPDenormMode = 2396 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2397 2398 if (ST.hasDenormModeInst()) { 2399 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2400 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2401 2402 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2403 B.buildInstr(AMDGPU::S_DENORM_MODE) 2404 .addImm(NewDenormModeValue); 2405 2406 } else { 2407 // Select FP32 bit field in mode register. 2408 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2409 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2410 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2411 2412 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2413 .addImm(SPDenormMode) 2414 .addImm(SPDenormModeBitField); 2415 } 2416 } 2417 2418 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2419 MachineRegisterInfo &MRI, 2420 MachineIRBuilder &B) const { 2421 B.setInstr(MI); 2422 Register Res = MI.getOperand(0).getReg(); 2423 Register LHS = MI.getOperand(1).getReg(); 2424 Register RHS = MI.getOperand(2).getReg(); 2425 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2426 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2427 2428 uint16_t Flags = MI.getFlags(); 2429 2430 LLT S32 = LLT::scalar(32); 2431 LLT S1 = LLT::scalar(1); 2432 2433 auto One = B.buildFConstant(S32, 1.0f); 2434 2435 auto DenominatorScaled = 2436 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2437 .addUse(RHS) 2438 .addUse(LHS) 2439 .addImm(1) 2440 .setMIFlags(Flags); 2441 auto NumeratorScaled = 2442 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2443 .addUse(LHS) 2444 .addUse(RHS) 2445 .addImm(0) 2446 .setMIFlags(Flags); 2447 2448 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2449 .addUse(DenominatorScaled.getReg(0)) 2450 .setMIFlags(Flags); 2451 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2452 2453 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2454 // aren't modeled as reading it. 
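// For reference, the sequence emitted below is the usual Newton-Raphson style
// refinement of the hardware reciprocal approximation, where n and d are the
// div_scale-adjusted numerator and denominator:
//   e0 = fma(-d, r0, 1.0)   ; Fma0: error of the initial rcp r0
//   r1 = fma(e0, r0, r0)    ; Fma1: refined reciprocal
//   q0 = n * r1             ; Mul:  quotient estimate
//   e1 = fma(-d, q0, n)     ; Fma2: remainder of the estimate
//   q1 = fma(e1, r1, q0)    ; Fma3: refined quotient
//   e2 = fma(-d, q1, n)     ; Fma4: final remainder, consumed by div_fmas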
2455   if (!Mode.allFP32Denormals())
2456     toggleSPDenormMode(true, B, ST, Mode);
2457
2458   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2459   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2460   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2461   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2462   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2463   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2464
2465   if (!Mode.allFP32Denormals())
2466     toggleSPDenormMode(false, B, ST, Mode);
2467
2468   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2469     .addUse(Fma4.getReg(0))
2470     .addUse(Fma1.getReg(0))
2471     .addUse(Fma3.getReg(0))
2472     .addUse(NumeratorScaled.getReg(1))
2473     .setMIFlags(Flags);
2474
2475   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2476     .addUse(Fmas.getReg(0))
2477     .addUse(RHS)
2478     .addUse(LHS)
2479     .setMIFlags(Flags);
2480
2481   MI.eraseFromParent();
2482   return true;
2483 }
2484
2485 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2486                                          MachineRegisterInfo &MRI,
2487                                          MachineIRBuilder &B) const {
2488   B.setInstr(MI);
2489   Register Res = MI.getOperand(0).getReg();
2490   Register LHS = MI.getOperand(1).getReg();
2491   Register RHS = MI.getOperand(2).getReg();
2492
2493   uint16_t Flags = MI.getFlags();
2494
2495   LLT S64 = LLT::scalar(64);
2496   LLT S1 = LLT::scalar(1);
2497
2498   auto One = B.buildFConstant(S64, 1.0);
2499
2500   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2501     .addUse(LHS)
2502     .addUse(RHS)
2503     .addImm(1)
2504     .setMIFlags(Flags);
2505
2506   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2507
2508   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2509     .addUse(DivScale0.getReg(0))
2510     .setMIFlags(Flags);
2511
2512   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2513   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2514   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2515
2516   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2517     .addUse(LHS)
2518     .addUse(RHS)
2519     .addImm(0)
2520     .setMIFlags(Flags);
2521
2522   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2523   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2524   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2525
2526   Register Scale;
2527   if (!ST.hasUsableDivScaleConditionOutput()) {
2528     // Workaround a hardware bug on SI where the condition output from div_scale
2529     // is not usable.
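// The condition feeds amdgcn.div.fmas and indicates whether div_scale applied
// its scale factor. On subtargets with the bug it is recomputed below by
// comparing the high (sign/exponent) dwords of the original operands against
// those of the div_scale results and XOR-ing the two comparisons.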
2530 2531 LLT S32 = LLT::scalar(32); 2532 2533 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2534 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2535 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2536 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2537 2538 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2539 Scale1Unmerge.getReg(1)); 2540 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2541 Scale0Unmerge.getReg(1)); 2542 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2543 } else { 2544 Scale = DivScale1.getReg(1); 2545 } 2546 2547 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2548 .addUse(Fma4.getReg(0)) 2549 .addUse(Fma3.getReg(0)) 2550 .addUse(Mul.getReg(0)) 2551 .addUse(Scale) 2552 .setMIFlags(Flags); 2553 2554 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2555 .addUse(Fmas.getReg(0)) 2556 .addUse(RHS) 2557 .addUse(LHS) 2558 .setMIFlags(Flags); 2559 2560 MI.eraseFromParent(); 2561 return true; 2562 } 2563 2564 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2565 MachineRegisterInfo &MRI, 2566 MachineIRBuilder &B) const { 2567 B.setInstr(MI); 2568 Register Res = MI.getOperand(0).getReg(); 2569 Register LHS = MI.getOperand(2).getReg(); 2570 Register RHS = MI.getOperand(3).getReg(); 2571 uint16_t Flags = MI.getFlags(); 2572 2573 LLT S32 = LLT::scalar(32); 2574 LLT S1 = LLT::scalar(1); 2575 2576 auto Abs = B.buildFAbs(S32, RHS, Flags); 2577 const APFloat C0Val(1.0f); 2578 2579 auto C0 = B.buildConstant(S32, 0x6f800000); 2580 auto C1 = B.buildConstant(S32, 0x2f800000); 2581 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2582 2583 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2584 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2585 2586 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2587 2588 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2589 .addUse(Mul0.getReg(0)) 2590 .setMIFlags(Flags); 2591 2592 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2593 2594 B.buildFMul(Res, Sel, Mul1, Flags); 2595 2596 MI.eraseFromParent(); 2597 return true; 2598 } 2599 2600 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2601 MachineRegisterInfo &MRI, 2602 MachineIRBuilder &B) const { 2603 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2604 if (!MFI->isEntryFunction()) { 2605 return legalizePreloadedArgIntrin(MI, MRI, B, 2606 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2607 } 2608 2609 B.setInstr(MI); 2610 2611 uint64_t Offset = 2612 ST.getTargetLowering()->getImplicitParameterOffset( 2613 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2614 Register DstReg = MI.getOperand(0).getReg(); 2615 LLT DstTy = MRI.getType(DstReg); 2616 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2617 2618 const ArgDescriptor *Arg; 2619 const TargetRegisterClass *RC; 2620 std::tie(Arg, RC) 2621 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2622 if (!Arg) 2623 return false; 2624 2625 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2626 if (!loadInputValue(KernargPtrReg, B, Arg)) 2627 return false; 2628 2629 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2630 MI.eraseFromParent(); 2631 return true; 2632 } 2633 2634 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2635 MachineRegisterInfo &MRI, 2636 MachineIRBuilder &B, 2637 unsigned AddrSpace) const { 2638 B.setInstr(MI); 2639 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2640 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2641 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2642 MI.eraseFromParent(); 2643 return true; 2644 } 2645 2646 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2647 // offset (the offset that is included in bounds checking and swizzling, to be 2648 // split between the instruction's voffset and immoffset fields) and soffset 2649 // (the offset that is excluded from bounds checking and swizzling, to go in 2650 // the instruction's soffset field). This function takes the first kind of 2651 // offset and figures out how to split it between voffset and immoffset. 2652 std::tuple<Register, unsigned, unsigned> 2653 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2654 Register OrigOffset) const { 2655 const unsigned MaxImm = 4095; 2656 Register BaseReg; 2657 unsigned TotalConstOffset; 2658 MachineInstr *OffsetDef; 2659 const LLT S32 = LLT::scalar(32); 2660 2661 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2662 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2663 2664 unsigned ImmOffset = TotalConstOffset; 2665 2666 // If the immediate value is too big for the immoffset field, put the value 2667 // and -4096 into the immoffset field so that the value that is copied/added 2668 // for the voffset field is a multiple of 4096, and it stands more chance 2669 // of being CSEd with the copy/add for another similar load/store. 2670 // However, do not do that rounding down to a multiple of 4096 if that is a 2671 // negative number, as it appears to be illegal to have a negative offset 2672 // in the vgpr, even if adding the immediate offset makes it positive. 2673 unsigned Overflow = ImmOffset & ~MaxImm; 2674 ImmOffset -= Overflow; 2675 if ((int32_t)Overflow < 0) { 2676 Overflow += ImmOffset; 2677 ImmOffset = 0; 2678 } 2679 2680 if (Overflow != 0) { 2681 if (!BaseReg) { 2682 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2683 } else { 2684 auto OverflowVal = B.buildConstant(S32, Overflow); 2685 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2686 } 2687 } 2688 2689 if (!BaseReg) 2690 BaseReg = B.buildConstant(S32, 0).getReg(0); 2691 2692 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2693 } 2694 2695 /// Handle register layout difference for f16 images for some subtargets. 2696 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2697 MachineRegisterInfo &MRI, 2698 Register Reg) const { 2699 if (!ST.hasUnpackedD16VMem()) 2700 return Reg; 2701 2702 const LLT S16 = LLT::scalar(16); 2703 const LLT S32 = LLT::scalar(32); 2704 LLT StoreVT = MRI.getType(Reg); 2705 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2706 2707 auto Unmerge = B.buildUnmerge(S16, Reg); 2708 2709 SmallVector<Register, 4> WideRegs; 2710 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2711 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2712 2713 int NumElts = StoreVT.getNumElements(); 2714 2715 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2716 } 2717 2718 Register AMDGPULegalizerInfo::fixStoreSourceType( 2719 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2720 MachineRegisterInfo *MRI = B.getMRI(); 2721 LLT Ty = MRI->getType(VData); 2722 2723 const LLT S16 = LLT::scalar(16); 2724 2725 // Fixup illegal register types for i8 stores. 
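// Sub-dword (s8 / s16) sources still occupy a full 32-bit VGPR, so widen them
// with G_ANYEXT here; the buffer-store opcode selected later and the memory
// operand keep track of the real store size.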
2726 if (Ty == LLT::scalar(8) || Ty == S16) { 2727 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2728 return AnyExt; 2729 } 2730 2731 if (Ty.isVector()) { 2732 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2733 if (IsFormat) 2734 return handleD16VData(B, *MRI, VData); 2735 } 2736 } 2737 2738 return VData; 2739 } 2740 2741 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2742 MachineRegisterInfo &MRI, 2743 MachineIRBuilder &B, 2744 bool IsTyped, 2745 bool IsFormat) const { 2746 B.setInstr(MI); 2747 2748 Register VData = MI.getOperand(1).getReg(); 2749 LLT Ty = MRI.getType(VData); 2750 LLT EltTy = Ty.getScalarType(); 2751 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2752 const LLT S32 = LLT::scalar(32); 2753 2754 VData = fixStoreSourceType(B, VData, IsFormat); 2755 Register RSrc = MI.getOperand(2).getReg(); 2756 2757 MachineMemOperand *MMO = *MI.memoperands_begin(); 2758 const int MemSize = MMO->getSize(); 2759 2760 unsigned ImmOffset; 2761 unsigned TotalOffset; 2762 2763 // The typed intrinsics add an immediate after the registers. 2764 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2765 2766 // The struct intrinsic variants add one additional operand over raw. 2767 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2768 Register VIndex; 2769 int OpOffset = 0; 2770 if (HasVIndex) { 2771 VIndex = MI.getOperand(3).getReg(); 2772 OpOffset = 1; 2773 } 2774 2775 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2776 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2777 2778 unsigned Format = 0; 2779 if (IsTyped) { 2780 Format = MI.getOperand(5 + OpOffset).getImm(); 2781 ++OpOffset; 2782 } 2783 2784 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2785 2786 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2787 if (TotalOffset != 0) 2788 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2789 2790 unsigned Opc; 2791 if (IsTyped) { 2792 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 2793 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 2794 } else if (IsFormat) { 2795 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 2796 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 2797 } else { 2798 switch (MemSize) { 2799 case 1: 2800 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 2801 break; 2802 case 2: 2803 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 2804 break; 2805 default: 2806 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 2807 break; 2808 } 2809 } 2810 2811 if (!VIndex) 2812 VIndex = B.buildConstant(S32, 0).getReg(0); 2813 2814 auto MIB = B.buildInstr(Opc) 2815 .addUse(VData) // vdata 2816 .addUse(RSrc) // rsrc 2817 .addUse(VIndex) // vindex 2818 .addUse(VOffset) // voffset 2819 .addUse(SOffset) // soffset 2820 .addImm(ImmOffset); // offset(imm) 2821 2822 if (IsTyped) 2823 MIB.addImm(Format); 2824 2825 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2826 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2827 .addMemOperand(MMO); 2828 2829 MI.eraseFromParent(); 2830 return true; 2831 } 2832 2833 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 2834 MachineRegisterInfo &MRI, 2835 MachineIRBuilder &B, 2836 bool IsFormat, 2837 bool IsTyped) const { 2838 B.setInstr(MI); 2839 2840 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
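// Intrinsic operand layout decoded below (illustrative; the intrinsic ID is
// operand 1):
//   raw:    dst, id, rsrc, voffset, soffset, [format,] aux          (6 or 7)
//   struct: dst, id, rsrc, vindex, voffset, soffset, [format,] aux  (7 or 8)
// Only the typed (tbuffer) forms carry the format immediate, which is why the
// raw and struct variants can be told apart purely by operand count.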
2841 MachineMemOperand *MMO = *MI.memoperands_begin(); 2842 const int MemSize = MMO->getSize(); 2843 const LLT S32 = LLT::scalar(32); 2844 2845 Register Dst = MI.getOperand(0).getReg(); 2846 Register RSrc = MI.getOperand(2).getReg(); 2847 2848 // The typed intrinsics add an immediate after the registers. 2849 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2850 2851 // The struct intrinsic variants add one additional operand over raw. 2852 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2853 Register VIndex; 2854 int OpOffset = 0; 2855 if (HasVIndex) { 2856 VIndex = MI.getOperand(3).getReg(); 2857 OpOffset = 1; 2858 } 2859 2860 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2861 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2862 2863 unsigned Format = 0; 2864 if (IsTyped) { 2865 Format = MI.getOperand(5 + OpOffset).getImm(); 2866 ++OpOffset; 2867 } 2868 2869 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2870 unsigned ImmOffset; 2871 unsigned TotalOffset; 2872 2873 LLT Ty = MRI.getType(Dst); 2874 LLT EltTy = Ty.getScalarType(); 2875 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2876 const bool Unpacked = ST.hasUnpackedD16VMem(); 2877 2878 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2879 if (TotalOffset != 0) 2880 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2881 2882 unsigned Opc; 2883 2884 if (IsTyped) { 2885 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 2886 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 2887 } else if (IsFormat) { 2888 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 2889 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 2890 } else { 2891 switch (MemSize) { 2892 case 1: 2893 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 2894 break; 2895 case 2: 2896 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 2897 break; 2898 default: 2899 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 2900 break; 2901 } 2902 } 2903 2904 Register LoadDstReg; 2905 2906 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 2907 LLT UnpackedTy = Ty.changeElementSize(32); 2908 2909 if (IsExtLoad) 2910 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 2911 else if (Unpacked && IsD16 && Ty.isVector()) 2912 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 2913 else 2914 LoadDstReg = Dst; 2915 2916 if (!VIndex) 2917 VIndex = B.buildConstant(S32, 0).getReg(0); 2918 2919 auto MIB = B.buildInstr(Opc) 2920 .addDef(LoadDstReg) // vdata 2921 .addUse(RSrc) // rsrc 2922 .addUse(VIndex) // vindex 2923 .addUse(VOffset) // voffset 2924 .addUse(SOffset) // soffset 2925 .addImm(ImmOffset); // offset(imm) 2926 2927 if (IsTyped) 2928 MIB.addImm(Format); 2929 2930 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2931 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2932 .addMemOperand(MMO); 2933 2934 if (LoadDstReg != Dst) { 2935 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2936 2937 // Widen result for extending loads was widened. 
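// (That is: extending loads were widened to a 32-bit result above, so
// truncate back to the requested type; unpacked d16 vector results are
// instead repacked from their s32 pieces.)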
2938 if (IsExtLoad) 2939 B.buildTrunc(Dst, LoadDstReg); 2940 else { 2941 // Repack to original 16-bit vector result 2942 // FIXME: G_TRUNC should work, but legalization currently fails 2943 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 2944 SmallVector<Register, 4> Repack; 2945 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 2946 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 2947 B.buildMerge(Dst, Repack); 2948 } 2949 } 2950 2951 MI.eraseFromParent(); 2952 return true; 2953 } 2954 2955 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 2956 MachineIRBuilder &B, 2957 bool IsInc) const { 2958 B.setInstr(MI); 2959 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 2960 AMDGPU::G_AMDGPU_ATOMIC_DEC; 2961 B.buildInstr(Opc) 2962 .addDef(MI.getOperand(0).getReg()) 2963 .addUse(MI.getOperand(2).getReg()) 2964 .addUse(MI.getOperand(3).getReg()) 2965 .cloneMemRefs(MI); 2966 MI.eraseFromParent(); 2967 return true; 2968 } 2969 2970 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 2971 switch (IntrID) { 2972 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 2973 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 2974 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 2975 case Intrinsic::amdgcn_raw_buffer_atomic_add: 2976 case Intrinsic::amdgcn_struct_buffer_atomic_add: 2977 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 2978 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 2979 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 2980 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 2981 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 2982 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 2983 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 2984 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 2985 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 2986 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 2987 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 2988 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 2989 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 2990 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 2991 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 2992 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 2993 case Intrinsic::amdgcn_raw_buffer_atomic_and: 2994 case Intrinsic::amdgcn_struct_buffer_atomic_and: 2995 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 2996 case Intrinsic::amdgcn_raw_buffer_atomic_or: 2997 case Intrinsic::amdgcn_struct_buffer_atomic_or: 2998 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 2999 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3000 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3001 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3002 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3003 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3004 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3005 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3006 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3007 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3008 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3009 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3010 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3011 default: 3012 llvm_unreachable("unhandled atomic opcode"); 3013 } 3014 } 3015 3016 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3017 MachineIRBuilder &B, 3018 Intrinsic::ID IID) const { 3019 B.setInstr(MI); 3020 3021 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3022 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3023 3024 Register Dst = MI.getOperand(0).getReg(); 3025 Register VData = 
MI.getOperand(2).getReg(); 3026 3027 Register CmpVal; 3028 int OpOffset = 0; 3029 3030 if (IsCmpSwap) { 3031 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3032 ++OpOffset; 3033 } 3034 3035 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3036 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3037 3038 // The struct intrinsic variants add one additional operand over raw. 3039 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3040 Register VIndex; 3041 if (HasVIndex) { 3042 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3043 ++OpOffset; 3044 } 3045 3046 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3047 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3048 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3049 3050 MachineMemOperand *MMO = *MI.memoperands_begin(); 3051 3052 unsigned ImmOffset; 3053 unsigned TotalOffset; 3054 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3055 if (TotalOffset != 0) 3056 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3057 3058 if (!VIndex) 3059 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3060 3061 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3062 .addDef(Dst) 3063 .addUse(VData); // vdata 3064 3065 if (IsCmpSwap) 3066 MIB.addReg(CmpVal); 3067 3068 MIB.addUse(RSrc) // rsrc 3069 .addUse(VIndex) // vindex 3070 .addUse(VOffset) // voffset 3071 .addUse(SOffset) // soffset 3072 .addImm(ImmOffset) // offset(imm) 3073 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3074 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3075 .addMemOperand(MMO); 3076 3077 MI.eraseFromParent(); 3078 return true; 3079 } 3080 3081 // Produce a vector of s16 elements from s32 pieces. 3082 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg, 3083 ArrayRef<Register> UnmergeParts) { 3084 const LLT S16 = LLT::scalar(16); 3085 3086 SmallVector<Register, 4> RemergeParts(UnmergeParts.size()); 3087 for (int I = 0, E = UnmergeParts.size(); I != E; ++I) 3088 RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0); 3089 3090 B.buildBuildVector(DstReg, RemergeParts); 3091 } 3092 3093 /// Convert a set of s32 registers to a result vector with s16 elements. 3094 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg, 3095 ArrayRef<Register> UnmergeParts) { 3096 MachineRegisterInfo &MRI = *B.getMRI(); 3097 const LLT V2S16 = LLT::vector(2, 16); 3098 LLT TargetTy = MRI.getType(DstReg); 3099 int NumElts = UnmergeParts.size(); 3100 3101 if (NumElts == 1) { 3102 assert(TargetTy == V2S16); 3103 B.buildBitcast(DstReg, UnmergeParts[0]); 3104 return; 3105 } 3106 3107 SmallVector<Register, 4> RemergeParts(NumElts); 3108 for (int I = 0; I != NumElts; ++I) 3109 RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0); 3110 3111 if (TargetTy.getSizeInBits() == 32u * NumElts) { 3112 B.buildConcatVectors(DstReg, RemergeParts); 3113 return; 3114 } 3115 3116 const LLT V3S16 = LLT::vector(3, 16); 3117 const LLT V6S16 = LLT::vector(6, 16); 3118 3119 // Widen to v6s16 and unpack v3 parts. 3120 assert(TargetTy == V3S16); 3121 3122 RemergeParts.push_back(B.buildUndef(V2S16).getReg(0)); 3123 auto Concat = B.buildConcatVectors(V6S16, RemergeParts); 3124 B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat); 3125 } 3126 3127 // FIXME: Just vector trunc should be sufficent, but legalization currently 3128 // broken. 
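// Illustrative sketch of the repack below for a <2 x s16> result that was
// widened to <2 x s32> (generic MIR; register names arbitrary):
//   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %wide:_(<2 x s32>)
//   %a:_(s16) = G_TRUNC %lo
//   %b:_(s16) = G_TRUNC %hi
//   %dst:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)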
3129 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg, 3130 Register WideDstReg) { 3131 const LLT S32 = LLT::scalar(32); 3132 const LLT S16 = LLT::scalar(16); 3133 3134 auto Unmerge = B.buildUnmerge(S32, WideDstReg); 3135 3136 int NumOps = Unmerge->getNumOperands() - 1; 3137 SmallVector<Register, 4> RemergeParts(NumOps); 3138 for (int I = 0; I != NumOps; ++I) 3139 RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0); 3140 3141 B.buildBuildVector(DstReg, RemergeParts); 3142 } 3143 3144 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3145 MachineInstr &MI, MachineIRBuilder &B, 3146 GISelChangeObserver &Observer, 3147 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3148 bool IsTFE = MI.getNumExplicitDefs() == 2; 3149 3150 // We are only processing the operands of d16 image operations on subtargets 3151 // that use the unpacked register layout, or need to repack the TFE result. 3152 3153 // TODO: Need to handle a16 images too 3154 // TODO: Do we need to guard against already legalized intrinsics? 3155 if (!IsTFE && !ST.hasUnpackedD16VMem()) 3156 return true; 3157 3158 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3159 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3160 3161 if (BaseOpcode->Atomic) // No d16 atomics, or TFE. 3162 return true; 3163 3164 B.setInstr(MI); 3165 3166 MachineRegisterInfo *MRI = B.getMRI(); 3167 const LLT S32 = LLT::scalar(32); 3168 const LLT S16 = LLT::scalar(16); 3169 3170 if (BaseOpcode->Store) { // No TFE for stores? 3171 Register VData = MI.getOperand(1).getReg(); 3172 LLT Ty = MRI->getType(VData); 3173 if (!Ty.isVector() || Ty.getElementType() != S16) 3174 return true; 3175 3176 B.setInstr(MI); 3177 3178 Observer.changingInstr(MI); 3179 MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData)); 3180 Observer.changedInstr(MI); 3181 return true; 3182 } 3183 3184 Register DstReg = MI.getOperand(0).getReg(); 3185 LLT Ty = MRI->getType(DstReg); 3186 const LLT EltTy = Ty.getScalarType(); 3187 const bool IsD16 = Ty.getScalarType() == S16; 3188 const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3189 3190 if (IsTFE) { 3191 // In the IR, TFE is supposed to be used with a 2 element struct return 3192 // type. The intruction really returns these two values in one contiguous 3193 // register, with one additional dword beyond the loaded data. Rewrite the 3194 // return type to use a single register result. 3195 Register Dst1Reg = MI.getOperand(1).getReg(); 3196 if (MRI->getType(Dst1Reg) != S32) 3197 return false; 3198 3199 // TODO: Make sure the TFE operand bit is set. 3200 3201 // The raw dword aligned data component of the load. The only legal cases 3202 // where this matters should be when using the packed D16 format, for 3203 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3204 LLT RoundedTy; 3205 LLT TFETy; 3206 3207 if (IsD16 && ST.hasUnpackedD16VMem()) { 3208 RoundedTy = LLT::scalarOrVector(NumElts, 32); 3209 TFETy = LLT::vector(NumElts + 1, 32); 3210 } else { 3211 unsigned EltSize = Ty.getScalarSizeInBits(); 3212 unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32; 3213 unsigned RoundedSize = 32 * RoundedElts; 3214 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3215 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3216 } 3217 3218 Register TFEReg = MRI->createGenericVirtualRegister(TFETy); 3219 Observer.changingInstr(MI); 3220 3221 MI.getOperand(0).setReg(TFEReg); 3222 MI.RemoveOperand(1); 3223 3224 Observer.changedInstr(MI); 3225 3226 // Insert after the instruction. 
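// For example (illustrative): a d16 TFE load of <2 x s16> on an unpacked
// subtarget is rewritten above to define a single <3 x s32> register; the
// code below unmerges that into two s32 data elements (repacked to the
// <2 x s16> result) plus the trailing TFE status dword, which lands in the
// original second result register.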
3227 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3228 3229 // Now figure out how to copy the new result register back into the old 3230 // result. 3231 3232 SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg); 3233 int NumDataElts = TFETy.getNumElements() - 1; 3234 3235 if (!Ty.isVector()) { 3236 // Simplest case is a trivial unmerge (plus a truncate for d16). 3237 UnmergeResults[0] = Ty == S32 ? 3238 DstReg : MRI->createGenericVirtualRegister(S32); 3239 3240 B.buildUnmerge(UnmergeResults, TFEReg); 3241 if (Ty != S32) 3242 B.buildTrunc(DstReg, UnmergeResults[0]); 3243 return true; 3244 } 3245 3246 // We have to repack into a new vector of some kind. 3247 for (int I = 0; I != NumDataElts; ++I) 3248 UnmergeResults[I] = MRI->createGenericVirtualRegister(S32); 3249 B.buildUnmerge(UnmergeResults, TFEReg); 3250 3251 // Drop the final TFE element. 3252 ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts); 3253 3254 if (EltTy == S32) 3255 B.buildBuildVector(DstReg, DataPart); 3256 else if (ST.hasUnpackedD16VMem()) 3257 truncToS16Vector(B, DstReg, DataPart); 3258 else 3259 bitcastToS16Vector(B, DstReg, DataPart); 3260 3261 return true; 3262 } 3263 3264 // Must be an image load. 3265 if (!Ty.isVector() || Ty.getElementType() != S16) 3266 return true; 3267 3268 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3269 3270 LLT WidenedTy = Ty.changeElementType(S32); 3271 Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy); 3272 3273 Observer.changingInstr(MI); 3274 MI.getOperand(0).setReg(WideDstReg); 3275 Observer.changedInstr(MI); 3276 3277 repackUnpackedD16Load(B, DstReg, WideDstReg); 3278 return true; 3279 } 3280 3281 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 3282 MachineInstr &MI, MachineIRBuilder &B, 3283 GISelChangeObserver &Observer) const { 3284 Register Dst = MI.getOperand(0).getReg(); 3285 LLT Ty = B.getMRI()->getType(Dst); 3286 unsigned Size = Ty.getSizeInBits(); 3287 MachineFunction &MF = B.getMF(); 3288 3289 Observer.changingInstr(MI); 3290 3291 // FIXME: We don't really need this intermediate instruction. The intrinsic 3292 // should be fixed to have a memory operand. Since it's readnone, we're not 3293 // allowed to add one. 3294 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 3295 MI.RemoveOperand(1); // Remove intrinsic ID 3296 3297 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 3298 // TODO: Should this use datalayout alignment? 3299 const unsigned MemSize = (Size + 7) / 8; 3300 const unsigned MemAlign = 4; 3301 MachineMemOperand *MMO = MF.getMachineMemOperand( 3302 MachinePointerInfo(), 3303 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 3304 MachineMemOperand::MOInvariant, MemSize, MemAlign); 3305 MI.addMemOperand(MF, MMO); 3306 3307 // There are no 96-bit result scalar loads, but widening to 128-bit should 3308 // always be legal. We may need to restore this to a 96-bit result if it turns 3309 // out this needs to be converted to a vector load during RegBankSelect. 
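// For example (illustrative): an s96 result is widened to s128 via
// widenScalarDst, and a <3 x s32> result to <4 x s32> via
// moreElementsVectorDst, so only power-of-two sized results reach selection.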
3310 if (!isPowerOf2_32(Size)) { 3311 LegalizerHelper Helper(MF, *this, Observer, B); 3312 B.setInstr(MI); 3313 3314 if (Ty.isVector()) 3315 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 3316 else 3317 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 3318 } 3319 3320 Observer.changedInstr(MI); 3321 return true; 3322 } 3323 3324 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 3325 MachineIRBuilder &B, 3326 GISelChangeObserver &Observer) const { 3327 MachineRegisterInfo &MRI = *B.getMRI(); 3328 3329 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 3330 auto IntrID = MI.getIntrinsicID(); 3331 switch (IntrID) { 3332 case Intrinsic::amdgcn_if: 3333 case Intrinsic::amdgcn_else: { 3334 MachineInstr *Br = nullptr; 3335 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3336 const SIRegisterInfo *TRI 3337 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3338 3339 B.setInstr(*BrCond); 3340 Register Def = MI.getOperand(1).getReg(); 3341 Register Use = MI.getOperand(3).getReg(); 3342 3343 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 3344 if (Br) 3345 BrTarget = Br->getOperand(0).getMBB(); 3346 3347 if (IntrID == Intrinsic::amdgcn_if) { 3348 B.buildInstr(AMDGPU::SI_IF) 3349 .addDef(Def) 3350 .addUse(Use) 3351 .addMBB(BrTarget); 3352 } else { 3353 B.buildInstr(AMDGPU::SI_ELSE) 3354 .addDef(Def) 3355 .addUse(Use) 3356 .addMBB(BrTarget) 3357 .addImm(0); 3358 } 3359 3360 if (Br) 3361 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 3362 3363 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 3364 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 3365 MI.eraseFromParent(); 3366 BrCond->eraseFromParent(); 3367 return true; 3368 } 3369 3370 return false; 3371 } 3372 case Intrinsic::amdgcn_loop: { 3373 MachineInstr *Br = nullptr; 3374 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3375 const SIRegisterInfo *TRI 3376 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3377 3378 B.setInstr(*BrCond); 3379 3380 // FIXME: Need to adjust branch targets based on unconditional branch. 
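// Sketch of the rewrite performed below (pseudo-MIR; names arbitrary):
//   %c:_(s1) = intrinsic(@llvm.amdgcn.loop), %mask
//   G_BRCOND %c(s1), %bb.target
// becomes
//   SI_LOOP %mask, %bb.target
// with %mask constrained to the wave mask register class and both the
// intrinsic and the G_BRCOND erased.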
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}