//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the scalar size to the next power of two bits.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

// Reduce the number of elements so each resulting piece fits in at most 64
// bits.
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
244 GlobalPtr, ConstantPtr, FlatPtr 245 }; 246 247 const std::initializer_list<LLT> AddrSpaces32 = { 248 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 249 }; 250 251 const std::initializer_list<LLT> FPTypesBase = { 252 S32, S64 253 }; 254 255 const std::initializer_list<LLT> FPTypes16 = { 256 S32, S64, S16 257 }; 258 259 const std::initializer_list<LLT> FPTypesPK16 = { 260 S32, S64, S16, V2S16 261 }; 262 263 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 264 265 setAction({G_BRCOND, S1}, Legal); // VCC branches 266 setAction({G_BRCOND, S32}, Legal); // SCC branches 267 268 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 269 // elements for v3s16 270 getActionDefinitionsBuilder(G_PHI) 271 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 272 .legalFor(AllS32Vectors) 273 .legalFor(AllS64Vectors) 274 .legalFor(AddrSpaces64) 275 .legalFor(AddrSpaces32) 276 .clampScalar(0, S32, S256) 277 .widenScalarToNextPow2(0, 32) 278 .clampMaxNumElements(0, S32, 16) 279 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 280 .legalIf(isPointer(0)); 281 282 if (ST.has16BitInsts()) { 283 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 284 .legalFor({S32, S16}) 285 .clampScalar(0, S16, S32) 286 .scalarize(0) 287 .widenScalarToNextPow2(0, 32); 288 } else { 289 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 290 .legalFor({S32}) 291 .clampScalar(0, S32, S32) 292 .scalarize(0); 293 } 294 295 // FIXME: Not really legal. Placeholder for custom lowering. 296 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 297 .legalFor({S32, S64}) 298 .clampScalar(0, S32, S64) 299 .widenScalarToNextPow2(0, 32) 300 .scalarize(0); 301 302 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 303 .legalFor({S32}) 304 .clampScalar(0, S32, S32) 305 .scalarize(0); 306 307 // Report legal for any types we can handle anywhere. For the cases only legal 308 // on the SALU, RegBankSelect will be able to re-legalize. 309 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 310 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 311 .clampScalar(0, S32, S64) 312 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 313 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 314 .widenScalarToNextPow2(0) 315 .scalarize(0); 316 317 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 318 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 319 .legalFor({{S32, S1}, {S32, S32}}) 320 .clampScalar(0, S32, S32) 321 .scalarize(0); // TODO: Implement. 322 323 getActionDefinitionsBuilder(G_BITCAST) 324 // Don't worry about the size constraint. 
325 .legalIf(all(isRegisterType(0), isRegisterType(1))) 326 .lower(); 327 328 329 getActionDefinitionsBuilder(G_CONSTANT) 330 .legalFor({S1, S32, S64, S16, GlobalPtr, 331 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 332 .clampScalar(0, S32, S64) 333 .widenScalarToNextPow2(0) 334 .legalIf(isPointer(0)); 335 336 getActionDefinitionsBuilder(G_FCONSTANT) 337 .legalFor({S32, S64, S16}) 338 .clampScalar(0, S16, S64); 339 340 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 341 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 342 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 343 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 344 .clampScalarOrElt(0, S32, S1024) 345 .legalIf(isMultiple32(0)) 346 .widenScalarToNextPow2(0, 32) 347 .clampMaxNumElements(0, S32, 16); 348 349 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 350 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 351 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); 352 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 353 354 auto &FPOpActions = getActionDefinitionsBuilder( 355 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 356 .legalFor({S32, S64}); 357 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 358 .customFor({S32, S64}); 359 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 360 .customFor({S32, S64}); 361 362 if (ST.has16BitInsts()) { 363 if (ST.hasVOP3PInsts()) 364 FPOpActions.legalFor({S16, V2S16}); 365 else 366 FPOpActions.legalFor({S16}); 367 368 TrigActions.customFor({S16}); 369 FDIVActions.customFor({S16}); 370 } 371 372 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 373 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 374 375 if (ST.hasVOP3PInsts()) { 376 MinNumMaxNum.customFor(FPTypesPK16) 377 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 378 .clampMaxNumElements(0, S16, 2) 379 .clampScalar(0, S16, S64) 380 .scalarize(0); 381 } else if (ST.has16BitInsts()) { 382 MinNumMaxNum.customFor(FPTypes16) 383 .clampScalar(0, S16, S64) 384 .scalarize(0); 385 } else { 386 MinNumMaxNum.customFor(FPTypesBase) 387 .clampScalar(0, S32, S64) 388 .scalarize(0); 389 } 390 391 if (ST.hasVOP3PInsts()) 392 FPOpActions.clampMaxNumElements(0, S16, 2); 393 394 FPOpActions 395 .scalarize(0) 396 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 397 398 TrigActions 399 .scalarize(0) 400 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 401 402 FDIVActions 403 .scalarize(0) 404 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 405 406 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 407 .legalFor(FPTypesPK16) 408 .clampMaxNumElements(0, S16, 2) 409 .scalarize(0) 410 .clampScalar(0, S16, S64); 411 412 if (ST.has16BitInsts()) { 413 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 414 .legalFor({S32, S64, S16}) 415 .scalarize(0) 416 .clampScalar(0, S16, S64); 417 } else { 418 getActionDefinitionsBuilder(G_FSQRT) 419 .legalFor({S32, S64}) 420 .scalarize(0) 421 .clampScalar(0, S32, S64); 422 423 if (ST.hasFractBug()) { 424 getActionDefinitionsBuilder(G_FFLOOR) 425 .customFor({S64}) 426 .legalFor({S32, S64}) 427 .scalarize(0) 428 .clampScalar(0, S32, S64); 429 } else { 430 getActionDefinitionsBuilder(G_FFLOOR) 431 .legalFor({S32, S64}) 432 .scalarize(0) 433 .clampScalar(0, S32, S64); 434 } 435 } 436 437 getActionDefinitionsBuilder(G_FPTRUNC) 438 .legalFor({{S32, S64}, {S16, S32}}) 439 .scalarize(0); 440 441 getActionDefinitionsBuilder(G_FPEXT) 442 .legalFor({{S64, S32}, {S32, S16}}) 443 .lowerFor({{S64, S16}}) // FIXME: Implement 444 .scalarize(0); 445 446 getActionDefinitionsBuilder(G_FSUB) 447 // Use actual fsub instruction 448 .legalFor({S32}) 449 // Must use fadd + fneg 450 .lowerFor({S64, S16, V2S16}) 451 .scalarize(0) 452 .clampScalar(0, S32, S64); 453 454 // Whether this is legal depends on the floating point mode for the function. 455 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 456 if (ST.hasMadF16()) 457 FMad.customFor({S32, S16}); 458 else 459 FMad.customFor({S32}); 460 FMad.scalarize(0) 461 .lower(); 462 463 getActionDefinitionsBuilder(G_TRUNC) 464 .alwaysLegal(); 465 466 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 467 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 468 {S32, S1}, {S64, S1}, {S16, S1}}) 469 .scalarize(0) 470 .clampScalar(0, S32, S64) 471 .widenScalarToNextPow2(1, 32); 472 473 // TODO: Split s1->s64 during regbankselect for VALU. 474 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 475 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 476 .lowerFor({{S32, S64}}) 477 .lowerIf(typeIs(1, S1)) 478 .customFor({{S64, S64}}); 479 if (ST.has16BitInsts()) 480 IToFP.legalFor({{S16, S16}}); 481 IToFP.clampScalar(1, S32, S64) 482 .scalarize(0); 483 484 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 485 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 486 .customFor({{S64, S64}}); 487 if (ST.has16BitInsts()) 488 FPToI.legalFor({{S16, S16}}); 489 else 490 FPToI.minScalar(1, S32); 491 492 FPToI.minScalar(0, S32) 493 .scalarize(0) 494 .lower(); 495 496 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 497 .scalarize(0) 498 .lower(); 499 500 if (ST.has16BitInsts()) { 501 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 502 .legalFor({S16, S32, S64}) 503 .clampScalar(0, S16, S64) 504 .scalarize(0); 505 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 506 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 507 .legalFor({S32, S64}) 508 .clampScalar(0, S32, S64) 509 .scalarize(0); 510 } else { 511 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 512 .legalFor({S32}) 513 .customFor({S64}) 514 .clampScalar(0, S32, S64) 515 .scalarize(0); 516 } 517 518 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 519 .scalarize(0) 520 .alwaysLegal(); 521 522 auto &CmpBuilder = 523 getActionDefinitionsBuilder(G_ICMP) 524 // The compare output type differs based on the register bank of the output, 525 // so make both s1 and s32 legal. 
526 // 527 // Scalar compares producing output in scc will be promoted to s32, as that 528 // is the allocatable register type that will be needed for the copy from 529 // scc. This will be promoted during RegBankSelect, and we assume something 530 // before that won't try to use s32 result types. 531 // 532 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 533 // bank. 534 .legalForCartesianProduct( 535 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 536 .legalForCartesianProduct( 537 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 538 if (ST.has16BitInsts()) { 539 CmpBuilder.legalFor({{S1, S16}}); 540 } 541 542 CmpBuilder 543 .widenScalarToNextPow2(1) 544 .clampScalar(1, S32, S64) 545 .scalarize(0) 546 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 547 548 getActionDefinitionsBuilder(G_FCMP) 549 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 550 .widenScalarToNextPow2(1) 551 .clampScalar(1, S32, S64) 552 .scalarize(0); 553 554 // FIXME: fpow has a selection pattern that should move to custom lowering. 555 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW}); 556 if (ST.has16BitInsts()) 557 Exp2Ops.legalFor({S32, S16}); 558 else 559 Exp2Ops.legalFor({S32}); 560 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 561 Exp2Ops.scalarize(0); 562 563 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10}); 564 if (ST.has16BitInsts()) 565 ExpOps.customFor({{S32}, {S16}}); 566 else 567 ExpOps.customFor({S32}); 568 ExpOps.clampScalar(0, MinScalarFPTy, S32) 569 .scalarize(0); 570 571 // The 64-bit versions produce 32-bit results, but only on the SALU. 572 getActionDefinitionsBuilder(G_CTPOP) 573 .legalFor({{S32, S32}, {S32, S64}}) 574 .clampScalar(0, S32, S32) 575 .clampScalar(1, S32, S64) 576 .scalarize(0) 577 .widenScalarToNextPow2(0, 32) 578 .widenScalarToNextPow2(1, 32); 579 580 // The hardware instructions return a different result on 0 than the generic 581 // instructions expect. The hardware produces -1, but these produce the 582 // bitwidth. 583 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 584 .scalarize(0) 585 .clampScalar(0, S32, S32) 586 .clampScalar(1, S32, S64) 587 .widenScalarToNextPow2(0, 32) 588 .widenScalarToNextPow2(1, 32) 589 .lower(); 590 591 // The 64-bit versions produce 32-bit results, but only on the SALU. 
592 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 593 .legalFor({{S32, S32}, {S32, S64}}) 594 .clampScalar(0, S32, S32) 595 .clampScalar(1, S32, S64) 596 .scalarize(0) 597 .widenScalarToNextPow2(0, 32) 598 .widenScalarToNextPow2(1, 32); 599 600 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) 601 .legalFor({S32}) 602 .clampScalar(0, S32, S32) 603 .scalarize(0); 604 605 if (ST.has16BitInsts()) { 606 if (ST.hasVOP3PInsts()) { 607 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 608 .legalFor({S32, S16, V2S16}) 609 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 610 .clampMaxNumElements(0, S16, 2) 611 .clampScalar(0, S16, S32) 612 .widenScalarToNextPow2(0) 613 .scalarize(0); 614 } else { 615 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 616 .legalFor({S32, S16}) 617 .widenScalarToNextPow2(0) 618 .clampScalar(0, S16, S32) 619 .scalarize(0); 620 } 621 } else { 622 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 623 .legalFor({S32}) 624 .clampScalar(0, S32, S32) 625 .widenScalarToNextPow2(0) 626 .scalarize(0); 627 } 628 629 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 630 return [=](const LegalityQuery &Query) { 631 return Query.Types[TypeIdx0].getSizeInBits() < 632 Query.Types[TypeIdx1].getSizeInBits(); 633 }; 634 }; 635 636 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 637 return [=](const LegalityQuery &Query) { 638 return Query.Types[TypeIdx0].getSizeInBits() > 639 Query.Types[TypeIdx1].getSizeInBits(); 640 }; 641 }; 642 643 getActionDefinitionsBuilder(G_INTTOPTR) 644 // List the common cases 645 .legalForCartesianProduct(AddrSpaces64, {S64}) 646 .legalForCartesianProduct(AddrSpaces32, {S32}) 647 .scalarize(0) 648 // Accept any address space as long as the size matches 649 .legalIf(sameSize(0, 1)) 650 .widenScalarIf(smallerThan(1, 0), 651 [](const LegalityQuery &Query) { 652 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 653 }) 654 .narrowScalarIf(greaterThan(1, 0), 655 [](const LegalityQuery &Query) { 656 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 657 }); 658 659 getActionDefinitionsBuilder(G_PTRTOINT) 660 // List the common cases 661 .legalForCartesianProduct(AddrSpaces64, {S64}) 662 .legalForCartesianProduct(AddrSpaces32, {S32}) 663 .scalarize(0) 664 // Accept any address space as long as the size matches 665 .legalIf(sameSize(0, 1)) 666 .widenScalarIf(smallerThan(0, 1), 667 [](const LegalityQuery &Query) { 668 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 669 }) 670 .narrowScalarIf( 671 greaterThan(0, 1), 672 [](const LegalityQuery &Query) { 673 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 674 }); 675 676 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 677 .scalarize(0) 678 .custom(); 679 680 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 681 // handle some operations by just promoting the register during 682 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 683 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 684 switch (AS) { 685 // FIXME: Private element size. 686 case AMDGPUAS::PRIVATE_ADDRESS: 687 return 32; 688 // FIXME: Check subtarget 689 case AMDGPUAS::LOCAL_ADDRESS: 690 return ST.useDS128() ? 128 : 64; 691 692 // Treat constant and global as identical. 
SMRD loads are sometimes usable 693 // for global loads (ideally constant address space should be eliminated) 694 // depending on the context. Legality cannot be context dependent, but 695 // RegBankSelect can split the load as necessary depending on the pointer 696 // register bank/uniformity and if the memory is invariant or not written in 697 // a kernel. 698 case AMDGPUAS::CONSTANT_ADDRESS: 699 case AMDGPUAS::GLOBAL_ADDRESS: 700 return IsLoad ? 512 : 128; 701 default: 702 return 128; 703 } 704 }; 705 706 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 707 bool IsLoad) -> bool { 708 const LLT DstTy = Query.Types[0]; 709 710 // Split vector extloads. 711 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 712 unsigned Align = Query.MMODescrs[0].AlignInBits; 713 714 if (MemSize < DstTy.getSizeInBits()) 715 MemSize = std::max(MemSize, Align); 716 717 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 718 return true; 719 720 const LLT PtrTy = Query.Types[1]; 721 unsigned AS = PtrTy.getAddressSpace(); 722 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 723 return true; 724 725 // Catch weird sized loads that don't evenly divide into the access sizes 726 // TODO: May be able to widen depending on alignment etc. 727 unsigned NumRegs = (MemSize + 31) / 32; 728 if (NumRegs == 3) { 729 if (!ST.hasDwordx3LoadStores()) 730 return true; 731 } else { 732 // If the alignment allows, these should have been widened. 733 if (!isPowerOf2_32(NumRegs)) 734 return true; 735 } 736 737 if (Align < MemSize) { 738 const SITargetLowering *TLI = ST.getTargetLowering(); 739 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 740 } 741 742 return false; 743 }; 744 745 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool { 746 unsigned Size = Query.Types[0].getSizeInBits(); 747 if (isPowerOf2_32(Size)) 748 return false; 749 750 if (Size == 96 && ST.hasDwordx3LoadStores()) 751 return false; 752 753 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 754 if (Size >= maxSizeForAddrSpace(AddrSpace, true)) 755 return false; 756 757 unsigned Align = Query.MMODescrs[0].AlignInBits; 758 unsigned RoundedSize = NextPowerOf2(Size); 759 return (Align >= RoundedSize); 760 }; 761 762 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 763 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 764 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 765 766 // TODO: Refine based on subtargets which support unaligned access or 128-bit 767 // LDS 768 // TODO: Unsupported flat for SI. 769 770 for (unsigned Op : {G_LOAD, G_STORE}) { 771 const bool IsStore = Op == G_STORE; 772 773 auto &Actions = getActionDefinitionsBuilder(Op); 774 // Whitelist the common cases. 
775 // TODO: Loads to s16 on gfx9 776 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 777 {V2S32, GlobalPtr, 64, GlobalAlign32}, 778 {V4S32, GlobalPtr, 128, GlobalAlign32}, 779 {S128, GlobalPtr, 128, GlobalAlign32}, 780 {S64, GlobalPtr, 64, GlobalAlign32}, 781 {V2S64, GlobalPtr, 128, GlobalAlign32}, 782 {V2S16, GlobalPtr, 32, GlobalAlign32}, 783 {S32, GlobalPtr, 8, GlobalAlign8}, 784 {S32, GlobalPtr, 16, GlobalAlign16}, 785 786 {S32, LocalPtr, 32, 32}, 787 {S64, LocalPtr, 64, 32}, 788 {V2S32, LocalPtr, 64, 32}, 789 {S32, LocalPtr, 8, 8}, 790 {S32, LocalPtr, 16, 16}, 791 {V2S16, LocalPtr, 32, 32}, 792 793 {S32, PrivatePtr, 32, 32}, 794 {S32, PrivatePtr, 8, 8}, 795 {S32, PrivatePtr, 16, 16}, 796 {V2S16, PrivatePtr, 32, 32}, 797 798 {S32, FlatPtr, 32, GlobalAlign32}, 799 {S32, FlatPtr, 16, GlobalAlign16}, 800 {S32, FlatPtr, 8, GlobalAlign8}, 801 {V2S16, FlatPtr, 32, GlobalAlign32}, 802 803 {S32, ConstantPtr, 32, GlobalAlign32}, 804 {V2S32, ConstantPtr, 64, GlobalAlign32}, 805 {V4S32, ConstantPtr, 128, GlobalAlign32}, 806 {S64, ConstantPtr, 64, GlobalAlign32}, 807 {S128, ConstantPtr, 128, GlobalAlign32}, 808 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 809 Actions 810 .customIf(typeIs(1, Constant32Ptr)) 811 // Widen suitably aligned loads by loading extra elements. 812 .moreElementsIf([=](const LegalityQuery &Query) { 813 const LLT Ty = Query.Types[0]; 814 return Op == G_LOAD && Ty.isVector() && 815 shouldWidenLoadResult(Query); 816 }, moreElementsToNextPow2(0)) 817 .widenScalarIf([=](const LegalityQuery &Query) { 818 const LLT Ty = Query.Types[0]; 819 return Op == G_LOAD && !Ty.isVector() && 820 shouldWidenLoadResult(Query); 821 }, widenScalarOrEltToNextPow2(0)) 822 .narrowScalarIf( 823 [=](const LegalityQuery &Query) -> bool { 824 return !Query.Types[0].isVector() && 825 needToSplitMemOp(Query, Op == G_LOAD); 826 }, 827 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 828 const LLT DstTy = Query.Types[0]; 829 const LLT PtrTy = Query.Types[1]; 830 831 const unsigned DstSize = DstTy.getSizeInBits(); 832 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 833 834 // Split extloads. 835 if (DstSize > MemSize) 836 return std::make_pair(0, LLT::scalar(MemSize)); 837 838 if (!isPowerOf2_32(DstSize)) { 839 // We're probably decomposing an odd sized store. Try to split 840 // to the widest type. TODO: Account for alignment. As-is it 841 // should be OK, since the new parts will be further legalized. 842 unsigned FloorSize = PowerOf2Floor(DstSize); 843 return std::make_pair(0, LLT::scalar(FloorSize)); 844 } 845 846 if (DstSize > 32 && (DstSize % 32 != 0)) { 847 // FIXME: Need a way to specify non-extload of larger size if 848 // suitably aligned. 
849 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 850 } 851 852 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 853 Op == G_LOAD); 854 if (MemSize > MaxSize) 855 return std::make_pair(0, LLT::scalar(MaxSize)); 856 857 unsigned Align = Query.MMODescrs[0].AlignInBits; 858 return std::make_pair(0, LLT::scalar(Align)); 859 }) 860 .fewerElementsIf( 861 [=](const LegalityQuery &Query) -> bool { 862 return Query.Types[0].isVector() && 863 needToSplitMemOp(Query, Op == G_LOAD); 864 }, 865 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 866 const LLT DstTy = Query.Types[0]; 867 const LLT PtrTy = Query.Types[1]; 868 869 LLT EltTy = DstTy.getElementType(); 870 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 871 Op == G_LOAD); 872 873 // FIXME: Handle widened to power of 2 results better. This ends 874 // up scalarizing. 875 // FIXME: 3 element stores scalarized on SI 876 877 // Split if it's too large for the address space. 878 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 879 unsigned NumElts = DstTy.getNumElements(); 880 unsigned EltSize = EltTy.getSizeInBits(); 881 882 if (MaxSize % EltSize == 0) { 883 return std::make_pair( 884 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 885 } 886 887 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 888 889 // FIXME: Refine when odd breakdowns handled 890 // The scalars will need to be re-legalized. 891 if (NumPieces == 1 || NumPieces >= NumElts || 892 NumElts % NumPieces != 0) 893 return std::make_pair(0, EltTy); 894 895 return std::make_pair(0, 896 LLT::vector(NumElts / NumPieces, EltTy)); 897 } 898 899 // FIXME: We could probably handle weird extending loads better. 900 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 901 if (DstTy.getSizeInBits() > MemSize) 902 return std::make_pair(0, EltTy); 903 904 unsigned EltSize = EltTy.getSizeInBits(); 905 unsigned DstSize = DstTy.getSizeInBits(); 906 if (!isPowerOf2_32(DstSize)) { 907 // We're probably decomposing an odd sized store. Try to split 908 // to the widest type. TODO: Account for alignment. As-is it 909 // should be OK, since the new parts will be further legalized. 910 unsigned FloorSize = PowerOf2Floor(DstSize); 911 return std::make_pair( 912 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 913 } 914 915 // Need to split because of alignment. 916 unsigned Align = Query.MMODescrs[0].AlignInBits; 917 if (EltSize > Align && 918 (EltSize / Align < DstTy.getNumElements())) { 919 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 920 } 921 922 // May need relegalization for the scalars. 923 return std::make_pair(0, EltTy); 924 }) 925 .minScalar(0, S32); 926 927 if (IsStore) 928 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 929 930 // TODO: Need a bitcast lower option? 931 Actions 932 .legalIf([=](const LegalityQuery &Query) { 933 const LLT Ty0 = Query.Types[0]; 934 unsigned Size = Ty0.getSizeInBits(); 935 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 936 unsigned Align = Query.MMODescrs[0].AlignInBits; 937 938 // FIXME: Widening store from alignment not valid. 939 if (MemSize < Size) 940 MemSize = std::max(MemSize, Align); 941 942 // No extending vector loads. 
943 if (Size > MemSize && Ty0.isVector()) 944 return false; 945 946 switch (MemSize) { 947 case 8: 948 case 16: 949 return Size == 32; 950 case 32: 951 case 64: 952 case 128: 953 return true; 954 case 96: 955 return ST.hasDwordx3LoadStores(); 956 case 256: 957 case 512: 958 return true; 959 default: 960 return false; 961 } 962 }) 963 .widenScalarToNextPow2(0) 964 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 965 } 966 967 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 968 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 969 {S32, GlobalPtr, 16, 2 * 8}, 970 {S32, LocalPtr, 8, 8}, 971 {S32, LocalPtr, 16, 16}, 972 {S32, PrivatePtr, 8, 8}, 973 {S32, PrivatePtr, 16, 16}, 974 {S32, ConstantPtr, 8, 8}, 975 {S32, ConstantPtr, 16, 2 * 8}}); 976 if (ST.hasFlatAddressSpace()) { 977 ExtLoads.legalForTypesWithMemDesc( 978 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 979 } 980 981 ExtLoads.clampScalar(0, S32, S32) 982 .widenScalarToNextPow2(0) 983 .unsupportedIfMemSizeNotPow2() 984 .lower(); 985 986 auto &Atomics = getActionDefinitionsBuilder( 987 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 988 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 989 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 990 G_ATOMICRMW_UMIN}) 991 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 992 {S64, GlobalPtr}, {S64, LocalPtr}}); 993 if (ST.hasFlatAddressSpace()) { 994 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 995 } 996 997 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 998 .legalFor({{S32, LocalPtr}}); 999 1000 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1001 // demarshalling 1002 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1003 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1004 {S32, FlatPtr}, {S64, FlatPtr}}) 1005 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1006 {S32, RegionPtr}, {S64, RegionPtr}}); 1007 // TODO: Pointer types, any 32-bit or 64-bit vector 1008 1009 // Condition should be s32 for scalar, s1 for vector. 1010 getActionDefinitionsBuilder(G_SELECT) 1011 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1012 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1013 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1014 .clampScalar(0, S16, S64) 1015 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1016 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1017 .scalarize(1) 1018 .clampMaxNumElements(0, S32, 2) 1019 .clampMaxNumElements(0, LocalPtr, 2) 1020 .clampMaxNumElements(0, PrivatePtr, 2) 1021 .scalarize(0) 1022 .widenScalarToNextPow2(0) 1023 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1024 1025 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1026 // be more flexible with the shift amount type. 1027 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1028 .legalFor({{S32, S32}, {S64, S32}}); 1029 if (ST.has16BitInsts()) { 1030 if (ST.hasVOP3PInsts()) { 1031 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 1032 .clampMaxNumElements(0, S16, 2); 1033 } else 1034 Shifts.legalFor({{S16, S32}, {S16, S16}}); 1035 1036 // TODO: Support 16-bit shift amounts 1037 Shifts.clampScalar(1, S32, S32); 1038 Shifts.clampScalar(0, S16, S64); 1039 Shifts.widenScalarToNextPow2(0, 16); 1040 } else { 1041 // Make sure we legalize the shift amount type first, as the general 1042 // expansion for the shifted type will produce much worse code if it hasn't 1043 // been truncated already. 
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1143 if (ST.hasVOP3PInsts()) { 1144 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1145 .customFor({V2S16, V2S16}) 1146 .lower(); 1147 } else 1148 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1149 1150 // Merge/Unmerge 1151 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1152 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1153 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1154 1155 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1156 const LLT &Ty = Query.Types[TypeIdx]; 1157 if (Ty.isVector()) { 1158 const LLT &EltTy = Ty.getElementType(); 1159 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) 1160 return true; 1161 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1162 return true; 1163 } 1164 return false; 1165 }; 1166 1167 auto &Builder = getActionDefinitionsBuilder(Op) 1168 // Try to widen to s16 first for small types. 1169 // TODO: Only do this on targets with legal s16 shifts 1170 .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1171 1172 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1173 .lowerFor({{S16, V2S16}}) 1174 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1175 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1176 elementTypeIs(1, S16)), 1177 changeTo(1, V2S16)) 1178 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1179 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1180 // valid. 1181 .clampScalar(LitTyIdx, S32, S256) 1182 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1183 // Break up vectors with weird elements into scalars 1184 .fewerElementsIf( 1185 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, 1186 scalarize(0)) 1187 .fewerElementsIf( 1188 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, 1189 scalarize(1)) 1190 .clampScalar(BigTyIdx, S32, S1024); 1191 1192 if (Op == G_MERGE_VALUES) { 1193 Builder.widenScalarIf( 1194 // TODO: Use 16-bit shifts if legal for 8-bit values? 1195 [=](const LegalityQuery &Query) { 1196 const LLT Ty = Query.Types[LitTyIdx]; 1197 return Ty.getSizeInBits() < 32; 1198 }, 1199 changeTo(LitTyIdx, S32)); 1200 } 1201 1202 Builder.widenScalarIf( 1203 [=](const LegalityQuery &Query) { 1204 const LLT Ty = Query.Types[BigTyIdx]; 1205 return !isPowerOf2_32(Ty.getSizeInBits()) && 1206 Ty.getSizeInBits() % 16 != 0; 1207 }, 1208 [=](const LegalityQuery &Query) { 1209 // Pick the next power of 2, or a multiple of 64 over 128. 1210 // Whichever is smaller. 1211 const LLT &Ty = Query.Types[BigTyIdx]; 1212 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1213 if (NewSizeInBits >= 256) { 1214 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1215 if (RoundedTo < NewSizeInBits) 1216 NewSizeInBits = RoundedTo; 1217 } 1218 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1219 }) 1220 .legalIf([=](const LegalityQuery &Query) { 1221 const LLT &BigTy = Query.Types[BigTyIdx]; 1222 const LLT &LitTy = Query.Types[LitTyIdx]; 1223 1224 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1225 return false; 1226 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1227 return false; 1228 1229 return BigTy.getSizeInBits() % 16 == 0 && 1230 LitTy.getSizeInBits() % 16 == 0 && 1231 BigTy.getSizeInBits() <= 1024; 1232 }) 1233 // Any vectors left are the wrong size. Scalarize them. 1234 .scalarize(0) 1235 .scalarize(1); 1236 } 1237 1238 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1239 // RegBankSelect. 
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case
TargetOpcode::G_FFLOOR: 1340 return legalizeFFloor(MI, MRI, B); 1341 case TargetOpcode::G_BUILD_VECTOR: 1342 return legalizeBuildVector(MI, MRI, B); 1343 default: 1344 return false; 1345 } 1346 1347 llvm_unreachable("expected switch to return"); 1348 } 1349 1350 Register AMDGPULegalizerInfo::getSegmentAperture( 1351 unsigned AS, 1352 MachineRegisterInfo &MRI, 1353 MachineIRBuilder &B) const { 1354 MachineFunction &MF = B.getMF(); 1355 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1356 const LLT S32 = LLT::scalar(32); 1357 1358 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1359 1360 if (ST.hasApertureRegs()) { 1361 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1362 // getreg. 1363 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1364 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1365 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1366 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1367 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1368 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1369 unsigned Encoding = 1370 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1371 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1372 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1373 1374 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1375 1376 B.buildInstr(AMDGPU::S_GETREG_B32) 1377 .addDef(GetReg) 1378 .addImm(Encoding); 1379 MRI.setType(GetReg, S32); 1380 1381 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1382 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1383 } 1384 1385 Register QueuePtr = MRI.createGenericVirtualRegister( 1386 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1387 1388 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1389 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1390 return Register(); 1391 1392 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1393 // private_segment_aperture_base_hi. 1394 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1395 1396 // TODO: can we be smarter about machine pointer info? 1397 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1398 MachineMemOperand *MMO = MF.getMachineMemOperand( 1399 PtrInfo, 1400 MachineMemOperand::MOLoad | 1401 MachineMemOperand::MODereferenceable | 1402 MachineMemOperand::MOInvariant, 1403 4, 1404 MinAlign(64, StructOffset)); 1405 1406 Register LoadAddr; 1407 1408 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1409 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1410 } 1411 1412 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1413 MachineInstr &MI, MachineRegisterInfo &MRI, 1414 MachineIRBuilder &B) const { 1415 MachineFunction &MF = B.getMF(); 1416 1417 B.setInstr(MI); 1418 1419 const LLT S32 = LLT::scalar(32); 1420 Register Dst = MI.getOperand(0).getReg(); 1421 Register Src = MI.getOperand(1).getReg(); 1422 1423 LLT DstTy = MRI.getType(Dst); 1424 LLT SrcTy = MRI.getType(Src); 1425 unsigned DestAS = DstTy.getAddressSpace(); 1426 unsigned SrcAS = SrcTy.getAddressSpace(); 1427 1428 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1429 // vector element. 
1430 assert(!DstTy.isVector()); 1431 1432 const AMDGPUTargetMachine &TM 1433 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1434 1435 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1436 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1437 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1438 return true; 1439 } 1440 1441 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1442 // Truncate. 1443 B.buildExtract(Dst, Src, 0); 1444 MI.eraseFromParent(); 1445 return true; 1446 } 1447 1448 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1449 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1450 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1451 1452 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1453 // another. Merge operands are required to be the same type, but creating an 1454 // extra ptrtoint would be kind of pointless. 1455 auto HighAddr = B.buildConstant( 1456 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1457 B.buildMerge(Dst, {Src, HighAddr}); 1458 MI.eraseFromParent(); 1459 return true; 1460 } 1461 1462 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1463 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1464 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1465 unsigned NullVal = TM.getNullPointerValue(DestAS); 1466 1467 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1468 auto FlatNull = B.buildConstant(SrcTy, 0); 1469 1470 // Extract low 32-bits of the pointer. 1471 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1472 1473 auto CmpRes = 1474 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1475 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1476 1477 MI.eraseFromParent(); 1478 return true; 1479 } 1480 1481 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1482 return false; 1483 1484 if (!ST.hasFlatAddressSpace()) 1485 return false; 1486 1487 auto SegmentNull = 1488 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1489 auto FlatNull = 1490 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1491 1492 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1493 if (!ApertureReg.isValid()) 1494 return false; 1495 1496 auto CmpRes = 1497 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1498 1499 // Coerce the type of the low half of the result so we can use merge_values. 1500 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1501 1502 // TODO: Should we allow mismatched types but matching sizes in merges to 1503 // avoid the ptrtoint? 1504 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1505 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1506 1507 MI.eraseFromParent(); 1508 return true; 1509 } 1510 1511 bool AMDGPULegalizerInfo::legalizeFrint( 1512 MachineInstr &MI, MachineRegisterInfo &MRI, 1513 MachineIRBuilder &B) const { 1514 B.setInstr(MI); 1515 1516 Register Src = MI.getOperand(1).getReg(); 1517 LLT Ty = MRI.getType(Src); 1518 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1519 1520 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1521 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1522 1523 auto C1 = B.buildFConstant(Ty, C1Val); 1524 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1525 1526 // TODO: Should this propagate fast-math-flags? 
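  // Adding and then subtracting copysign(2^52, Src) rounds the value to an
  // integer in double precision; inputs with magnitude above 2^52 - 0.5 (C2)
  // are already integral and are returned unchanged by the select below.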
1527 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1528 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1529 1530 auto C2 = B.buildFConstant(Ty, C2Val); 1531 auto Fabs = B.buildFAbs(Ty, Src); 1532 1533 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1534 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1535 return true; 1536 } 1537 1538 bool AMDGPULegalizerInfo::legalizeFceil( 1539 MachineInstr &MI, MachineRegisterInfo &MRI, 1540 MachineIRBuilder &B) const { 1541 B.setInstr(MI); 1542 1543 const LLT S1 = LLT::scalar(1); 1544 const LLT S64 = LLT::scalar(64); 1545 1546 Register Src = MI.getOperand(1).getReg(); 1547 assert(MRI.getType(Src) == S64); 1548 1549 // result = trunc(src) 1550 // if (src > 0.0 && src != result) 1551 // result += 1.0 1552 1553 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1554 1555 const auto Zero = B.buildFConstant(S64, 0.0); 1556 const auto One = B.buildFConstant(S64, 1.0); 1557 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1558 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1559 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1560 auto Add = B.buildSelect(S64, And, One, Zero); 1561 1562 // TODO: Should this propagate fast-math-flags? 1563 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1564 return true; 1565 } 1566 1567 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1568 MachineIRBuilder &B) { 1569 const unsigned FractBits = 52; 1570 const unsigned ExpBits = 11; 1571 LLT S32 = LLT::scalar(32); 1572 1573 auto Const0 = B.buildConstant(S32, FractBits - 32); 1574 auto Const1 = B.buildConstant(S32, ExpBits); 1575 1576 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1577 .addUse(Const0.getReg(0)) 1578 .addUse(Const1.getReg(0)); 1579 1580 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1581 } 1582 1583 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1584 MachineInstr &MI, MachineRegisterInfo &MRI, 1585 MachineIRBuilder &B) const { 1586 B.setInstr(MI); 1587 1588 const LLT S1 = LLT::scalar(1); 1589 const LLT S32 = LLT::scalar(32); 1590 const LLT S64 = LLT::scalar(64); 1591 1592 Register Src = MI.getOperand(1).getReg(); 1593 assert(MRI.getType(Src) == S64); 1594 1595 // TODO: Should this use extract since the low half is unused? 1596 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1597 Register Hi = Unmerge.getReg(1); 1598 1599 // Extract the upper half, since this is where we will find the sign and 1600 // exponent. 1601 auto Exp = extractF64Exponent(Hi, B); 1602 1603 const unsigned FractBits = 52; 1604 1605 // Extract the sign bit. 1606 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1607 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1608 1609 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1610 1611 const auto Zero32 = B.buildConstant(S32, 0); 1612 1613 // Extend back to 64-bits. 
1614 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1615 1616 auto Shr = B.buildAShr(S64, FractMask, Exp); 1617 auto Not = B.buildNot(S64, Shr); 1618 auto Tmp0 = B.buildAnd(S64, Src, Not); 1619 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1620 1621 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1622 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1623 1624 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1625 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1626 return true; 1627 } 1628 1629 bool AMDGPULegalizerInfo::legalizeITOFP( 1630 MachineInstr &MI, MachineRegisterInfo &MRI, 1631 MachineIRBuilder &B, bool Signed) const { 1632 B.setInstr(MI); 1633 1634 Register Dst = MI.getOperand(0).getReg(); 1635 Register Src = MI.getOperand(1).getReg(); 1636 1637 const LLT S64 = LLT::scalar(64); 1638 const LLT S32 = LLT::scalar(32); 1639 1640 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1641 1642 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1643 1644 auto CvtHi = Signed ? 1645 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1646 B.buildUITOFP(S64, Unmerge.getReg(1)); 1647 1648 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1649 1650 auto ThirtyTwo = B.buildConstant(S32, 32); 1651 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1652 .addUse(CvtHi.getReg(0)) 1653 .addUse(ThirtyTwo.getReg(0)); 1654 1655 // TODO: Should this propagate fast-math-flags? 1656 B.buildFAdd(Dst, LdExp, CvtLo); 1657 MI.eraseFromParent(); 1658 return true; 1659 } 1660 1661 // TODO: Copied from DAG implementation. Verify logic and document how this 1662 // actually works. 1663 bool AMDGPULegalizerInfo::legalizeFPTOI( 1664 MachineInstr &MI, MachineRegisterInfo &MRI, 1665 MachineIRBuilder &B, bool Signed) const { 1666 B.setInstr(MI); 1667 1668 Register Dst = MI.getOperand(0).getReg(); 1669 Register Src = MI.getOperand(1).getReg(); 1670 1671 const LLT S64 = LLT::scalar(64); 1672 const LLT S32 = LLT::scalar(32); 1673 1674 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1675 1676 unsigned Flags = MI.getFlags(); 1677 1678 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1679 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1680 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1681 1682 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1683 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1684 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1685 1686 auto Hi = Signed ? 
1687 B.buildFPTOSI(S32, FloorMul) : 1688 B.buildFPTOUI(S32, FloorMul); 1689 auto Lo = B.buildFPTOUI(S32, Fma); 1690 1691 B.buildMerge(Dst, { Lo, Hi }); 1692 MI.eraseFromParent(); 1693 1694 return true; 1695 } 1696 1697 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1698 MachineInstr &MI, MachineRegisterInfo &MRI, 1699 MachineIRBuilder &B) const { 1700 MachineFunction &MF = B.getMF(); 1701 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1702 1703 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1704 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1705 1706 // With ieee_mode disabled, the instructions have the correct behavior 1707 // already for G_FMINNUM/G_FMAXNUM 1708 if (!MFI->getMode().IEEE) 1709 return !IsIEEEOp; 1710 1711 if (IsIEEEOp) 1712 return true; 1713 1714 MachineIRBuilder HelperBuilder(MI); 1715 GISelObserverWrapper DummyObserver; 1716 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1717 HelperBuilder.setInstr(MI); 1718 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1719 } 1720 1721 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1722 MachineInstr &MI, MachineRegisterInfo &MRI, 1723 MachineIRBuilder &B) const { 1724 // TODO: Should move some of this into LegalizerHelper. 1725 1726 // TODO: Promote dynamic indexing of s16 to s32 1727 1728 // FIXME: Artifact combiner probably should have replaced the truncated 1729 // constant before this, so we shouldn't need 1730 // getConstantVRegValWithLookThrough. 1731 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1732 MI.getOperand(2).getReg(), MRI); 1733 if (!IdxVal) // Dynamic case will be selected to register indexing. 1734 return true; 1735 1736 Register Dst = MI.getOperand(0).getReg(); 1737 Register Vec = MI.getOperand(1).getReg(); 1738 1739 LLT VecTy = MRI.getType(Vec); 1740 LLT EltTy = VecTy.getElementType(); 1741 assert(EltTy == MRI.getType(Dst)); 1742 1743 B.setInstr(MI); 1744 1745 if (IdxVal->Value < VecTy.getNumElements()) 1746 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1747 else 1748 B.buildUndef(Dst); 1749 1750 MI.eraseFromParent(); 1751 return true; 1752 } 1753 1754 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1755 MachineInstr &MI, MachineRegisterInfo &MRI, 1756 MachineIRBuilder &B) const { 1757 // TODO: Should move some of this into LegalizerHelper. 1758 1759 // TODO: Promote dynamic indexing of s16 to s32 1760 1761 // FIXME: Artifact combiner probably should have replaced the truncated 1762 // constant before this, so we shouldn't need 1763 // getConstantVRegValWithLookThrough. 1764 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1765 MI.getOperand(3).getReg(), MRI); 1766 if (!IdxVal) // Dynamic case will be selected to register indexing. 1767 return true; 1768 1769 Register Dst = MI.getOperand(0).getReg(); 1770 Register Vec = MI.getOperand(1).getReg(); 1771 Register Ins = MI.getOperand(2).getReg(); 1772 1773 LLT VecTy = MRI.getType(Vec); 1774 LLT EltTy = VecTy.getElementType(); 1775 assert(EltTy == MRI.getType(Ins)); 1776 1777 B.setInstr(MI); 1778 1779 if (IdxVal->Value < VecTy.getNumElements()) 1780 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1781 else 1782 B.buildUndef(Dst); 1783 1784 MI.eraseFromParent(); 1785 return true; 1786 } 1787 1788 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) { 1789 assert(Mask.size() == 2); 1790 1791 // If one half is undef, the other is trivially in the same reg. 
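// Beyond that, the mask is only legal when both lanes read from the same
// source operand: {0,1}-style masks take both halves from the first v2s16 and
// {2,3}-style masks take both halves from the second, which the VOP3P op_sel
// modifiers can express without a real shuffle.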
1792 if (Mask[0] == -1 || Mask[1] == -1) 1793 return true; 1794 return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) || 1795 ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3)); 1796 } 1797 1798 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1799 MachineInstr &MI, MachineRegisterInfo &MRI, 1800 MachineIRBuilder &B) const { 1801 const LLT V2S16 = LLT::vector(2, 16); 1802 1803 Register Dst = MI.getOperand(0).getReg(); 1804 Register Src0 = MI.getOperand(1).getReg(); 1805 LLT DstTy = MRI.getType(Dst); 1806 LLT SrcTy = MRI.getType(Src0); 1807 1808 if (SrcTy == V2S16 && DstTy == V2S16 && 1809 isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1810 return true; 1811 1812 MachineIRBuilder HelperBuilder(MI); 1813 GISelObserverWrapper DummyObserver; 1814 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1815 HelperBuilder.setInstr(MI); 1816 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1817 } 1818 1819 bool AMDGPULegalizerInfo::legalizeSinCos( 1820 MachineInstr &MI, MachineRegisterInfo &MRI, 1821 MachineIRBuilder &B) const { 1822 B.setInstr(MI); 1823 1824 Register DstReg = MI.getOperand(0).getReg(); 1825 Register SrcReg = MI.getOperand(1).getReg(); 1826 LLT Ty = MRI.getType(DstReg); 1827 unsigned Flags = MI.getFlags(); 1828 1829 Register TrigVal; 1830 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1831 if (ST.hasTrigReducedRange()) { 1832 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1833 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1834 .addUse(MulVal.getReg(0)) 1835 .setMIFlags(Flags).getReg(0); 1836 } else 1837 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1838 1839 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1840 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1841 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1842 .addUse(TrigVal) 1843 .setMIFlags(Flags); 1844 MI.eraseFromParent(); 1845 return true; 1846 } 1847 1848 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1849 Register DstReg, LLT PtrTy, 1850 MachineIRBuilder &B, const GlobalValue *GV, 1851 unsigned Offset, unsigned GAFlags) const { 1852 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1853 // to the following code sequence: 1854 // 1855 // For constant address space: 1856 // s_getpc_b64 s[0:1] 1857 // s_add_u32 s0, s0, $symbol 1858 // s_addc_u32 s1, s1, 0 1859 // 1860 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1861 // a fixup or relocation is emitted to replace $symbol with a literal 1862 // constant, which is a pc-relative offset from the encoding of the $symbol 1863 // operand to the global variable. 1864 // 1865 // For global address space: 1866 // s_getpc_b64 s[0:1] 1867 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1868 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1869 // 1870 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1871 // fixups or relocations are emitted to replace $symbol@*@lo and 1872 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1873 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1874 // operand to the global variable. 
1875 //
1876 // What we want here is an offset from the value returned by s_getpc
1877 // (which is the address of the s_add_u32 instruction) to the global
1878 // variable, but since the encoding of $symbol starts 4 bytes after the start
1879 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1880 // small. This requires us to add 4 to the global variable offset in order to
1881 // compute the correct address.
1882
1883 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1884
1885 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1886 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1887
1888 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1889 .addDef(PCReg);
1890
1891 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1892 if (GAFlags == SIInstrInfo::MO_NONE)
1893 MIB.addImm(0);
1894 else
1895 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1896
1897 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1898
1899 if (PtrTy.getSizeInBits() == 32)
1900 B.buildExtract(DstReg, PCReg, 0);
1901 return true;
1902 }
1903
1904 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1905 MachineInstr &MI, MachineRegisterInfo &MRI,
1906 MachineIRBuilder &B) const {
1907 Register DstReg = MI.getOperand(0).getReg();
1908 LLT Ty = MRI.getType(DstReg);
1909 unsigned AS = Ty.getAddressSpace();
1910
1911 const GlobalValue *GV = MI.getOperand(1).getGlobal();
1912 MachineFunction &MF = B.getMF();
1913 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1914 B.setInstr(MI);
1915
1916 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1917 if (!MFI->isEntryFunction()) {
1918 const Function &Fn = MF.getFunction();
1919 DiagnosticInfoUnsupported BadLDSDecl(
1920 Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1921 Fn.getContext().diagnose(BadLDSDecl);
1922 }
1923
1924 // TODO: We could emit code to handle the initialization somewhere.
1925 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1926 const SITargetLowering *TLI = ST.getTargetLowering();
1927 if (!TLI->shouldUseLDSConstAddress(GV)) {
1928 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1929 return true; // Leave in place.
1930 }
1931
1932 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1933 MI.eraseFromParent();
1934 return true;
1935 }
1936
1937 const Function &Fn = MF.getFunction();
1938 DiagnosticInfoUnsupported BadInit(
1939 Fn, "unsupported initializer for address space", MI.getDebugLoc());
1940 Fn.getContext().diagnose(BadInit);
1941 return true;
1942 }
1943
1944 const SITargetLowering *TLI = ST.getTargetLowering();
1945
1946 if (TLI->shouldEmitFixup(GV)) {
1947 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1948 MI.eraseFromParent();
1949 return true;
1950 }
1951
1952 if (TLI->shouldEmitPCReloc(GV)) {
1953 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1954 MI.eraseFromParent();
1955 return true;
1956 }
1957
1958 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1959 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1960
1961 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1962 MachinePointerInfo::getGOT(MF),
1963 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1964 MachineMemOperand::MOInvariant,
1965 8 /*Size*/, 8 /*Align*/);
1966
1967 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1968
1969 if (Ty.getSizeInBits() == 32) {
1970 // Truncate if this is a 32-bit constant address.
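// The GOT entry itself always holds a 64-bit pointer, so load the full value
// and extract the low 32 bits for 32-bit address spaces.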
1971 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 1972 B.buildExtract(DstReg, Load, 0); 1973 } else 1974 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 1975 1976 MI.eraseFromParent(); 1977 return true; 1978 } 1979 1980 bool AMDGPULegalizerInfo::legalizeLoad( 1981 MachineInstr &MI, MachineRegisterInfo &MRI, 1982 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 1983 B.setInstr(MI); 1984 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1985 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 1986 Observer.changingInstr(MI); 1987 MI.getOperand(1).setReg(Cast.getReg(0)); 1988 Observer.changedInstr(MI); 1989 return true; 1990 } 1991 1992 bool AMDGPULegalizerInfo::legalizeFMad( 1993 MachineInstr &MI, MachineRegisterInfo &MRI, 1994 MachineIRBuilder &B) const { 1995 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 1996 assert(Ty.isScalar()); 1997 1998 MachineFunction &MF = B.getMF(); 1999 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2000 2001 // TODO: Always legal with future ftz flag. 2002 // FIXME: Do we need just output? 2003 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2004 return true; 2005 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2006 return true; 2007 2008 MachineIRBuilder HelperBuilder(MI); 2009 GISelObserverWrapper DummyObserver; 2010 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2011 HelperBuilder.setMBB(*MI.getParent()); 2012 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2013 } 2014 2015 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2016 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2017 Register DstReg = MI.getOperand(0).getReg(); 2018 Register PtrReg = MI.getOperand(1).getReg(); 2019 Register CmpVal = MI.getOperand(2).getReg(); 2020 Register NewVal = MI.getOperand(3).getReg(); 2021 2022 assert(SITargetLowering::isFlatGlobalAddrSpace( 2023 MRI.getType(PtrReg).getAddressSpace()) && 2024 "this should not have been custom lowered"); 2025 2026 LLT ValTy = MRI.getType(CmpVal); 2027 LLT VecTy = LLT::vector(2, ValTy); 2028 2029 B.setInstr(MI); 2030 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2031 2032 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2033 .addDef(DstReg) 2034 .addUse(PtrReg) 2035 .addUse(PackedVal) 2036 .setMemRefs(MI.memoperands()); 2037 2038 MI.eraseFromParent(); 2039 return true; 2040 } 2041 2042 bool AMDGPULegalizerInfo::legalizeFlog( 2043 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2044 Register Dst = MI.getOperand(0).getReg(); 2045 Register Src = MI.getOperand(1).getReg(); 2046 LLT Ty = B.getMRI()->getType(Dst); 2047 unsigned Flags = MI.getFlags(); 2048 B.setInstr(MI); 2049 2050 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2051 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2052 2053 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2054 MI.eraseFromParent(); 2055 return true; 2056 } 2057 2058 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2059 MachineIRBuilder &B) const { 2060 Register Dst = MI.getOperand(0).getReg(); 2061 Register Src = MI.getOperand(1).getReg(); 2062 unsigned Flags = MI.getFlags(); 2063 LLT Ty = B.getMRI()->getType(Dst); 2064 B.setInstr(MI); 2065 2066 auto K = B.buildFConstant(Ty, numbers::log2e); 2067 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2068 B.buildFExp2(Dst, Mul, Flags); 2069 MI.eraseFromParent(); 2070 return true; 2071 } 2072 2073 // Find a source register, ignoring 
any possible source modifiers. 2074 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2075 Register ModSrc = OrigSrc; 2076 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2077 ModSrc = SrcFNeg->getOperand(1).getReg(); 2078 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2079 ModSrc = SrcFAbs->getOperand(1).getReg(); 2080 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2081 ModSrc = SrcFAbs->getOperand(1).getReg(); 2082 return ModSrc; 2083 } 2084 2085 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2086 MachineRegisterInfo &MRI, 2087 MachineIRBuilder &B) const { 2088 B.setInstr(MI); 2089 2090 const LLT S1 = LLT::scalar(1); 2091 const LLT S64 = LLT::scalar(64); 2092 Register Dst = MI.getOperand(0).getReg(); 2093 Register OrigSrc = MI.getOperand(1).getReg(); 2094 unsigned Flags = MI.getFlags(); 2095 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2096 "this should not have been custom lowered"); 2097 2098 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2099 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2100 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2101 // V_FRACT bug is: 2102 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2103 // 2104 // Convert floor(x) to (x - fract(x)) 2105 2106 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2107 .addUse(OrigSrc) 2108 .setMIFlags(Flags); 2109 2110 // Give source modifier matching some assistance before obscuring a foldable 2111 // pattern. 2112 2113 // TODO: We can avoid the neg on the fract? The input sign to fract 2114 // shouldn't matter? 2115 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2116 2117 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2118 2119 Register Min = MRI.createGenericVirtualRegister(S64); 2120 2121 // We don't need to concern ourselves with the snan handling difference, so 2122 // use the one which will directly select. 2123 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2124 if (MFI->getMode().IEEE) 2125 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2126 else 2127 B.buildFMinNum(Min, Fract, Const, Flags); 2128 2129 Register CorrectedFract = Min; 2130 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2131 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2132 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2133 } 2134 2135 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2136 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2137 2138 MI.eraseFromParent(); 2139 return true; 2140 } 2141 2142 // Turn an illegal packed v2s16 build vector into bit operations. 2143 // TODO: This should probably be a bitcast action in LegalizerHelper. 
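// e.g. G_BUILD_VECTOR (s16 a), (s16 b) becomes a G_MERGE_VALUES that forms an
// s32 with 'a' in the low 16 bits and 'b' in the high 16 bits, followed by a
// bitcast of that s32 to <2 x s16>.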
2144 bool AMDGPULegalizerInfo::legalizeBuildVector( 2145 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2146 Register Dst = MI.getOperand(0).getReg(); 2147 LLT DstTy = MRI.getType(Dst); 2148 const LLT S32 = LLT::scalar(32); 2149 const LLT V2S16 = LLT::vector(2, 16); 2150 (void)DstTy; 2151 (void)V2S16; 2152 assert(DstTy == V2S16); 2153 2154 Register Src0 = MI.getOperand(1).getReg(); 2155 Register Src1 = MI.getOperand(2).getReg(); 2156 assert(MRI.getType(Src0) == LLT::scalar(16)); 2157 2158 B.setInstr(MI); 2159 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2160 B.buildBitcast(Dst, Merge); 2161 2162 MI.eraseFromParent(); 2163 return true; 2164 } 2165 2166 // Return the use branch instruction, otherwise null if the usage is invalid. 2167 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2168 MachineRegisterInfo &MRI, 2169 MachineInstr *&Br) { 2170 Register CondDef = MI.getOperand(0).getReg(); 2171 if (!MRI.hasOneNonDBGUse(CondDef)) 2172 return nullptr; 2173 2174 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2175 if (UseMI.getParent() != MI.getParent() || 2176 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2177 return nullptr; 2178 2179 // Make sure the cond br is followed by a G_BR 2180 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2181 if (Next != MI.getParent()->end()) { 2182 if (Next->getOpcode() != AMDGPU::G_BR) 2183 return nullptr; 2184 Br = &*Next; 2185 } 2186 2187 return &UseMI; 2188 } 2189 2190 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, 2191 Register Reg, LLT Ty) const { 2192 Register LiveIn = MRI.getLiveInVirtReg(Reg); 2193 if (LiveIn) 2194 return LiveIn; 2195 2196 Register NewReg = MRI.createGenericVirtualRegister(Ty); 2197 MRI.addLiveIn(Reg, NewReg); 2198 return NewReg; 2199 } 2200 2201 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2202 const ArgDescriptor *Arg) const { 2203 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2204 return false; // TODO: Handle these 2205 2206 assert(Arg->getRegister().isPhysical()); 2207 2208 MachineRegisterInfo &MRI = *B.getMRI(); 2209 2210 LLT Ty = MRI.getType(DstReg); 2211 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); 2212 2213 if (Arg->isMasked()) { 2214 // TODO: Should we try to emit this once in the entry block? 2215 const LLT S32 = LLT::scalar(32); 2216 const unsigned Mask = Arg->getMask(); 2217 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2218 2219 Register AndMaskSrc = LiveIn; 2220 2221 if (Shift != 0) { 2222 auto ShiftAmt = B.buildConstant(S32, Shift); 2223 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2224 } 2225 2226 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2227 } else 2228 B.buildCopy(DstReg, LiveIn); 2229 2230 // Insert the argument copy if it doens't already exist. 2231 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
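// If nothing defines the live-in vreg yet, emit the copy from the physical
// argument register once at the top of the entry block so every use of this
// argument shares it, then restore the original insertion point.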
2232 if (!MRI.getVRegDef(LiveIn)) { 2233 // FIXME: Should have scoped insert pt 2234 MachineBasicBlock &OrigInsBB = B.getMBB(); 2235 auto OrigInsPt = B.getInsertPt(); 2236 2237 MachineBasicBlock &EntryMBB = B.getMF().front(); 2238 EntryMBB.addLiveIn(Arg->getRegister()); 2239 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2240 B.buildCopy(LiveIn, Arg->getRegister()); 2241 2242 B.setInsertPt(OrigInsBB, OrigInsPt); 2243 } 2244 2245 return true; 2246 } 2247 2248 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2249 MachineInstr &MI, 2250 MachineRegisterInfo &MRI, 2251 MachineIRBuilder &B, 2252 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2253 B.setInstr(MI); 2254 2255 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2256 2257 const ArgDescriptor *Arg; 2258 const TargetRegisterClass *RC; 2259 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2260 if (!Arg) { 2261 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2262 return false; 2263 } 2264 2265 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 2266 MI.eraseFromParent(); 2267 return true; 2268 } 2269 2270 return false; 2271 } 2272 2273 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2274 MachineRegisterInfo &MRI, 2275 MachineIRBuilder &B) const { 2276 B.setInstr(MI); 2277 Register Dst = MI.getOperand(0).getReg(); 2278 LLT DstTy = MRI.getType(Dst); 2279 LLT S16 = LLT::scalar(16); 2280 LLT S32 = LLT::scalar(32); 2281 LLT S64 = LLT::scalar(64); 2282 2283 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2284 return true; 2285 2286 if (DstTy == S16) 2287 return legalizeFDIV16(MI, MRI, B); 2288 if (DstTy == S32) 2289 return legalizeFDIV32(MI, MRI, B); 2290 if (DstTy == S64) 2291 return legalizeFDIV64(MI, MRI, B); 2292 2293 return false; 2294 } 2295 2296 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2297 MachineRegisterInfo &MRI, 2298 MachineIRBuilder &B) const { 2299 Register Res = MI.getOperand(0).getReg(); 2300 Register LHS = MI.getOperand(1).getReg(); 2301 Register RHS = MI.getOperand(2).getReg(); 2302 2303 uint16_t Flags = MI.getFlags(); 2304 2305 LLT ResTy = MRI.getType(Res); 2306 LLT S32 = LLT::scalar(32); 2307 LLT S64 = LLT::scalar(64); 2308 2309 const MachineFunction &MF = B.getMF(); 2310 bool Unsafe = 2311 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2312 2313 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2314 return false; 2315 2316 if (!Unsafe && ResTy == S32 && 2317 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2318 return false; 2319 2320 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2321 // 1 / x -> RCP(x) 2322 if (CLHS->isExactlyValue(1.0)) { 2323 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2324 .addUse(RHS) 2325 .setMIFlags(Flags); 2326 2327 MI.eraseFromParent(); 2328 return true; 2329 } 2330 2331 // -1 / x -> RCP( FNEG(x) ) 2332 if (CLHS->isExactlyValue(-1.0)) { 2333 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2334 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2335 .addUse(FNeg.getReg(0)) 2336 .setMIFlags(Flags); 2337 2338 MI.eraseFromParent(); 2339 return true; 2340 } 2341 } 2342 2343 // x / y -> x * (1.0 / y) 2344 if (Unsafe) { 2345 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2346 .addUse(RHS) 2347 .setMIFlags(Flags); 2348 B.buildFMul(Res, LHS, RCP, Flags); 2349 2350 MI.eraseFromParent(); 2351 return true; 2352 } 2353 2354 return false; 2355 } 2356 2357 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2358 MachineRegisterInfo &MRI, 2359 
MachineIRBuilder &B) const { 2360 B.setInstr(MI); 2361 Register Res = MI.getOperand(0).getReg(); 2362 Register LHS = MI.getOperand(1).getReg(); 2363 Register RHS = MI.getOperand(2).getReg(); 2364 2365 uint16_t Flags = MI.getFlags(); 2366 2367 LLT S16 = LLT::scalar(16); 2368 LLT S32 = LLT::scalar(32); 2369 2370 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2371 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2372 2373 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2374 .addUse(RHSExt.getReg(0)) 2375 .setMIFlags(Flags); 2376 2377 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2378 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2379 2380 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2381 .addUse(RDst.getReg(0)) 2382 .addUse(RHS) 2383 .addUse(LHS) 2384 .setMIFlags(Flags); 2385 2386 MI.eraseFromParent(); 2387 return true; 2388 } 2389 2390 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2391 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2392 static void toggleSPDenormMode(bool Enable, 2393 MachineIRBuilder &B, 2394 const GCNSubtarget &ST, 2395 AMDGPU::SIModeRegisterDefaults Mode) { 2396 // Set SP denorm mode to this value. 2397 unsigned SPDenormMode = 2398 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2399 2400 if (ST.hasDenormModeInst()) { 2401 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2402 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2403 2404 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2405 B.buildInstr(AMDGPU::S_DENORM_MODE) 2406 .addImm(NewDenormModeValue); 2407 2408 } else { 2409 // Select FP32 bit field in mode register. 2410 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2411 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2412 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2413 2414 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2415 .addImm(SPDenormMode) 2416 .addImm(SPDenormModeBitField); 2417 } 2418 } 2419 2420 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2421 MachineRegisterInfo &MRI, 2422 MachineIRBuilder &B) const { 2423 B.setInstr(MI); 2424 Register Res = MI.getOperand(0).getReg(); 2425 Register LHS = MI.getOperand(1).getReg(); 2426 Register RHS = MI.getOperand(2).getReg(); 2427 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2428 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2429 2430 uint16_t Flags = MI.getFlags(); 2431 2432 LLT S32 = LLT::scalar(32); 2433 LLT S1 = LLT::scalar(1); 2434 2435 auto One = B.buildFConstant(S32, 1.0f); 2436 2437 auto DenominatorScaled = 2438 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2439 .addUse(RHS) 2440 .addUse(LHS) 2441 .addImm(1) 2442 .setMIFlags(Flags); 2443 auto NumeratorScaled = 2444 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2445 .addUse(LHS) 2446 .addUse(RHS) 2447 .addImm(0) 2448 .setMIFlags(Flags); 2449 2450 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2451 .addUse(DenominatorScaled.getReg(0)) 2452 .setMIFlags(Flags); 2453 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2454 2455 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2456 // aren't modeled as reading it. 
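// If FP32 denormals are flushed by default, temporarily enable them around the
// reciprocal refinement FMAs below so intermediate results are not flushed,
// and switch the mode back before div_fmas.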
2457 if (!Mode.allFP32Denormals())
2458 toggleSPDenormMode(true, B, ST, Mode);
2459
2460 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2461 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2462 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2463 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2464 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2465 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2466
2467 if (!Mode.allFP32Denormals())
2468 toggleSPDenormMode(false, B, ST, Mode);
2469
2470 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2471 .addUse(Fma4.getReg(0))
2472 .addUse(Fma1.getReg(0))
2473 .addUse(Fma3.getReg(0))
2474 .addUse(NumeratorScaled.getReg(1))
2475 .setMIFlags(Flags);
2476
2477 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2478 .addUse(Fmas.getReg(0))
2479 .addUse(RHS)
2480 .addUse(LHS)
2481 .setMIFlags(Flags);
2482
2483 MI.eraseFromParent();
2484 return true;
2485 }
2486
2487 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2488 MachineRegisterInfo &MRI,
2489 MachineIRBuilder &B) const {
2490 B.setInstr(MI);
2491 Register Res = MI.getOperand(0).getReg();
2492 Register LHS = MI.getOperand(1).getReg();
2493 Register RHS = MI.getOperand(2).getReg();
2494
2495 uint16_t Flags = MI.getFlags();
2496
2497 LLT S64 = LLT::scalar(64);
2498 LLT S1 = LLT::scalar(1);
2499
2500 auto One = B.buildFConstant(S64, 1.0);
2501
2502 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2503 .addUse(LHS)
2504 .addUse(RHS)
2505 .addImm(1)
2506 .setMIFlags(Flags);
2507
2508 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2509
2510 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2511 .addUse(DivScale0.getReg(0))
2512 .setMIFlags(Flags);
2513
2514 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2515 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2516 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2517
2518 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2519 .addUse(LHS)
2520 .addUse(RHS)
2521 .addImm(0)
2522 .setMIFlags(Flags);
2523
2524 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2525 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2526 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2527
2528 Register Scale;
2529 if (!ST.hasUsableDivScaleConditionOutput()) {
2530 // Workaround a hardware bug on SI where the condition output from div_scale
2531 // is not usable.
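// Reconstruct the condition by hand: compare the high dwords of the operands
// against the high dwords of the two div_scale results, and XOR the compares
// to form the flag passed to div_fmas.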
2532 2533 LLT S32 = LLT::scalar(32); 2534 2535 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2536 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2537 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2538 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2539 2540 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2541 Scale1Unmerge.getReg(1)); 2542 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2543 Scale0Unmerge.getReg(1)); 2544 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2545 } else { 2546 Scale = DivScale1.getReg(1); 2547 } 2548 2549 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2550 .addUse(Fma4.getReg(0)) 2551 .addUse(Fma3.getReg(0)) 2552 .addUse(Mul.getReg(0)) 2553 .addUse(Scale) 2554 .setMIFlags(Flags); 2555 2556 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2557 .addUse(Fmas.getReg(0)) 2558 .addUse(RHS) 2559 .addUse(LHS) 2560 .setMIFlags(Flags); 2561 2562 MI.eraseFromParent(); 2563 return true; 2564 } 2565 2566 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2567 MachineRegisterInfo &MRI, 2568 MachineIRBuilder &B) const { 2569 B.setInstr(MI); 2570 Register Res = MI.getOperand(0).getReg(); 2571 Register LHS = MI.getOperand(2).getReg(); 2572 Register RHS = MI.getOperand(3).getReg(); 2573 uint16_t Flags = MI.getFlags(); 2574 2575 LLT S32 = LLT::scalar(32); 2576 LLT S1 = LLT::scalar(1); 2577 2578 auto Abs = B.buildFAbs(S32, RHS, Flags); 2579 const APFloat C0Val(1.0f); 2580 2581 auto C0 = B.buildConstant(S32, 0x6f800000); 2582 auto C1 = B.buildConstant(S32, 0x2f800000); 2583 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2584 2585 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2586 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2587 2588 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2589 2590 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2591 .addUse(Mul0.getReg(0)) 2592 .setMIFlags(Flags); 2593 2594 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2595 2596 B.buildFMul(Res, Sel, Mul1, Flags); 2597 2598 MI.eraseFromParent(); 2599 return true; 2600 } 2601 2602 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2603 MachineRegisterInfo &MRI, 2604 MachineIRBuilder &B) const { 2605 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2606 if (!MFI->isEntryFunction()) { 2607 return legalizePreloadedArgIntrin(MI, MRI, B, 2608 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2609 } 2610 2611 B.setInstr(MI); 2612 2613 uint64_t Offset = 2614 ST.getTargetLowering()->getImplicitParameterOffset( 2615 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2616 Register DstReg = MI.getOperand(0).getReg(); 2617 LLT DstTy = MRI.getType(DstReg); 2618 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2619 2620 const ArgDescriptor *Arg; 2621 const TargetRegisterClass *RC; 2622 std::tie(Arg, RC) 2623 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2624 if (!Arg) 2625 return false; 2626 2627 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2628 if (!loadInputValue(KernargPtrReg, B, Arg)) 2629 return false; 2630 2631 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2632 MI.eraseFromParent(); 2633 return true; 2634 } 2635 2636 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2637 MachineRegisterInfo &MRI, 2638 MachineIRBuilder &B, 2639 unsigned AddrSpace) const { 2640 B.setInstr(MI); 2641 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2642 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2643 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2644 MI.eraseFromParent(); 2645 return true; 2646 } 2647 2648 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2649 // offset (the offset that is included in bounds checking and swizzling, to be 2650 // split between the instruction's voffset and immoffset fields) and soffset 2651 // (the offset that is excluded from bounds checking and swizzling, to go in 2652 // the instruction's soffset field). This function takes the first kind of 2653 // offset and figures out how to split it between voffset and immoffset. 2654 std::tuple<Register, unsigned, unsigned> 2655 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2656 Register OrigOffset) const { 2657 const unsigned MaxImm = 4095; 2658 Register BaseReg; 2659 unsigned TotalConstOffset; 2660 MachineInstr *OffsetDef; 2661 const LLT S32 = LLT::scalar(32); 2662 2663 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2664 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2665 2666 unsigned ImmOffset = TotalConstOffset; 2667 2668 // If the immediate value is too big for the immoffset field, put the value 2669 // and -4096 into the immoffset field so that the value that is copied/added 2670 // for the voffset field is a multiple of 4096, and it stands more chance 2671 // of being CSEd with the copy/add for another similar load/store. 2672 // However, do not do that rounding down to a multiple of 4096 if that is a 2673 // negative number, as it appears to be illegal to have a negative offset 2674 // in the vgpr, even if adding the immediate offset makes it positive. 2675 unsigned Overflow = ImmOffset & ~MaxImm; 2676 ImmOffset -= Overflow; 2677 if ((int32_t)Overflow < 0) { 2678 Overflow += ImmOffset; 2679 ImmOffset = 0; 2680 } 2681 2682 if (Overflow != 0) { 2683 if (!BaseReg) { 2684 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2685 } else { 2686 auto OverflowVal = B.buildConstant(S32, Overflow); 2687 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2688 } 2689 } 2690 2691 if (!BaseReg) 2692 BaseReg = B.buildConstant(S32, 0).getReg(0); 2693 2694 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2695 } 2696 2697 /// Handle register layout difference for f16 images for some subtargets. 2698 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2699 MachineRegisterInfo &MRI, 2700 Register Reg) const { 2701 if (!ST.hasUnpackedD16VMem()) 2702 return Reg; 2703 2704 const LLT S16 = LLT::scalar(16); 2705 const LLT S32 = LLT::scalar(32); 2706 LLT StoreVT = MRI.getType(Reg); 2707 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2708 2709 auto Unmerge = B.buildUnmerge(S16, Reg); 2710 2711 SmallVector<Register, 4> WideRegs; 2712 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2713 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2714 2715 int NumElts = StoreVT.getNumElements(); 2716 2717 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2718 } 2719 2720 Register AMDGPULegalizerInfo::fixStoreSourceType( 2721 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2722 MachineRegisterInfo *MRI = B.getMRI(); 2723 LLT Ty = MRI->getType(VData); 2724 2725 const LLT S16 = LLT::scalar(16); 2726 2727 // Fixup illegal register types for i8 stores. 
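// Buffer store instructions take at least a full 32-bit VGPR of data, so
// any-extend s8 and s16 sources to s32; the byte/short store opcode is still
// picked later from the memory operand's size.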
2728 if (Ty == LLT::scalar(8) || Ty == S16) { 2729 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2730 return AnyExt; 2731 } 2732 2733 if (Ty.isVector()) { 2734 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2735 if (IsFormat) 2736 return handleD16VData(B, *MRI, VData); 2737 } 2738 } 2739 2740 return VData; 2741 } 2742 2743 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2744 MachineRegisterInfo &MRI, 2745 MachineIRBuilder &B, 2746 bool IsTyped, 2747 bool IsFormat) const { 2748 B.setInstr(MI); 2749 2750 Register VData = MI.getOperand(1).getReg(); 2751 LLT Ty = MRI.getType(VData); 2752 LLT EltTy = Ty.getScalarType(); 2753 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2754 const LLT S32 = LLT::scalar(32); 2755 2756 VData = fixStoreSourceType(B, VData, IsFormat); 2757 Register RSrc = MI.getOperand(2).getReg(); 2758 2759 MachineMemOperand *MMO = *MI.memoperands_begin(); 2760 const int MemSize = MMO->getSize(); 2761 2762 unsigned ImmOffset; 2763 unsigned TotalOffset; 2764 2765 // The typed intrinsics add an immediate after the registers. 2766 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2767 2768 // The struct intrinsic variants add one additional operand over raw. 2769 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2770 Register VIndex; 2771 int OpOffset = 0; 2772 if (HasVIndex) { 2773 VIndex = MI.getOperand(3).getReg(); 2774 OpOffset = 1; 2775 } 2776 2777 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2778 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2779 2780 unsigned Format = 0; 2781 if (IsTyped) { 2782 Format = MI.getOperand(5 + OpOffset).getImm(); 2783 ++OpOffset; 2784 } 2785 2786 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2787 2788 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2789 if (TotalOffset != 0) 2790 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2791 2792 unsigned Opc; 2793 if (IsTyped) { 2794 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 2795 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 2796 } else if (IsFormat) { 2797 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 2798 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 2799 } else { 2800 switch (MemSize) { 2801 case 1: 2802 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 2803 break; 2804 case 2: 2805 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 2806 break; 2807 default: 2808 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 2809 break; 2810 } 2811 } 2812 2813 if (!VIndex) 2814 VIndex = B.buildConstant(S32, 0).getReg(0); 2815 2816 auto MIB = B.buildInstr(Opc) 2817 .addUse(VData) // vdata 2818 .addUse(RSrc) // rsrc 2819 .addUse(VIndex) // vindex 2820 .addUse(VOffset) // voffset 2821 .addUse(SOffset) // soffset 2822 .addImm(ImmOffset); // offset(imm) 2823 2824 if (IsTyped) 2825 MIB.addImm(Format); 2826 2827 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2828 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2829 .addMemOperand(MMO); 2830 2831 MI.eraseFromParent(); 2832 return true; 2833 } 2834 2835 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 2836 MachineRegisterInfo &MRI, 2837 MachineIRBuilder &B, 2838 bool IsFormat, 2839 bool IsTyped) const { 2840 B.setInstr(MI); 2841 2842 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
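// Operand order is: dst, intrinsic ID, rsrc, [vindex for struct variants],
// voffset, soffset, [format for typed variants], aux. The struct forms are
// distinguished purely by their operand count below.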
2843 MachineMemOperand *MMO = *MI.memoperands_begin(); 2844 const int MemSize = MMO->getSize(); 2845 const LLT S32 = LLT::scalar(32); 2846 2847 Register Dst = MI.getOperand(0).getReg(); 2848 Register RSrc = MI.getOperand(2).getReg(); 2849 2850 // The typed intrinsics add an immediate after the registers. 2851 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2852 2853 // The struct intrinsic variants add one additional operand over raw. 2854 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2855 Register VIndex; 2856 int OpOffset = 0; 2857 if (HasVIndex) { 2858 VIndex = MI.getOperand(3).getReg(); 2859 OpOffset = 1; 2860 } 2861 2862 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2863 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2864 2865 unsigned Format = 0; 2866 if (IsTyped) { 2867 Format = MI.getOperand(5 + OpOffset).getImm(); 2868 ++OpOffset; 2869 } 2870 2871 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2872 unsigned ImmOffset; 2873 unsigned TotalOffset; 2874 2875 LLT Ty = MRI.getType(Dst); 2876 LLT EltTy = Ty.getScalarType(); 2877 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2878 const bool Unpacked = ST.hasUnpackedD16VMem(); 2879 2880 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2881 if (TotalOffset != 0) 2882 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2883 2884 unsigned Opc; 2885 2886 if (IsTyped) { 2887 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 2888 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 2889 } else if (IsFormat) { 2890 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 2891 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 2892 } else { 2893 switch (MemSize) { 2894 case 1: 2895 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 2896 break; 2897 case 2: 2898 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 2899 break; 2900 default: 2901 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 2902 break; 2903 } 2904 } 2905 2906 Register LoadDstReg; 2907 2908 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 2909 LLT UnpackedTy = Ty.changeElementSize(32); 2910 2911 if (IsExtLoad) 2912 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 2913 else if (Unpacked && IsD16 && Ty.isVector()) 2914 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 2915 else 2916 LoadDstReg = Dst; 2917 2918 if (!VIndex) 2919 VIndex = B.buildConstant(S32, 0).getReg(0); 2920 2921 auto MIB = B.buildInstr(Opc) 2922 .addDef(LoadDstReg) // vdata 2923 .addUse(RSrc) // rsrc 2924 .addUse(VIndex) // vindex 2925 .addUse(VOffset) // voffset 2926 .addUse(SOffset) // soffset 2927 .addImm(ImmOffset); // offset(imm) 2928 2929 if (IsTyped) 2930 MIB.addImm(Format); 2931 2932 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2933 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2934 .addMemOperand(MMO); 2935 2936 if (LoadDstReg != Dst) { 2937 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2938 2939 // Widen result for extending loads was widened. 
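// The load was built with a wider destination register; narrow it back here:
// extending loads just truncate, while unpacked d16 vector results are
// unmerged into dwords and repacked into the original 16-bit element vector.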
2940 if (IsExtLoad) 2941 B.buildTrunc(Dst, LoadDstReg); 2942 else { 2943 // Repack to original 16-bit vector result 2944 // FIXME: G_TRUNC should work, but legalization currently fails 2945 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 2946 SmallVector<Register, 4> Repack; 2947 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 2948 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 2949 B.buildMerge(Dst, Repack); 2950 } 2951 } 2952 2953 MI.eraseFromParent(); 2954 return true; 2955 } 2956 2957 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 2958 MachineIRBuilder &B, 2959 bool IsInc) const { 2960 B.setInstr(MI); 2961 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 2962 AMDGPU::G_AMDGPU_ATOMIC_DEC; 2963 B.buildInstr(Opc) 2964 .addDef(MI.getOperand(0).getReg()) 2965 .addUse(MI.getOperand(2).getReg()) 2966 .addUse(MI.getOperand(3).getReg()) 2967 .cloneMemRefs(MI); 2968 MI.eraseFromParent(); 2969 return true; 2970 } 2971 2972 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 2973 switch (IntrID) { 2974 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 2975 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 2976 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 2977 case Intrinsic::amdgcn_raw_buffer_atomic_add: 2978 case Intrinsic::amdgcn_struct_buffer_atomic_add: 2979 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 2980 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 2981 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 2982 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 2983 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 2984 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 2985 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 2986 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 2987 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 2988 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 2989 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 2990 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 2991 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 2992 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 2993 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 2994 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 2995 case Intrinsic::amdgcn_raw_buffer_atomic_and: 2996 case Intrinsic::amdgcn_struct_buffer_atomic_and: 2997 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 2998 case Intrinsic::amdgcn_raw_buffer_atomic_or: 2999 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3000 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3001 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3002 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3003 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3004 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3005 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3006 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3007 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3008 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3009 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3010 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3011 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3012 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3013 default: 3014 llvm_unreachable("unhandled atomic opcode"); 3015 } 3016 } 3017 3018 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3019 MachineIRBuilder &B, 3020 Intrinsic::ID IID) const { 3021 B.setInstr(MI); 3022 3023 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3024 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3025 3026 Register Dst = MI.getOperand(0).getReg(); 3027 Register VData = 
MI.getOperand(2).getReg(); 3028 3029 Register CmpVal; 3030 int OpOffset = 0; 3031 3032 if (IsCmpSwap) { 3033 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3034 ++OpOffset; 3035 } 3036 3037 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3038 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3039 3040 // The struct intrinsic variants add one additional operand over raw. 3041 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3042 Register VIndex; 3043 if (HasVIndex) { 3044 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3045 ++OpOffset; 3046 } 3047 3048 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3049 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3050 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3051 3052 MachineMemOperand *MMO = *MI.memoperands_begin(); 3053 3054 unsigned ImmOffset; 3055 unsigned TotalOffset; 3056 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3057 if (TotalOffset != 0) 3058 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3059 3060 if (!VIndex) 3061 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3062 3063 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3064 .addDef(Dst) 3065 .addUse(VData); // vdata 3066 3067 if (IsCmpSwap) 3068 MIB.addReg(CmpVal); 3069 3070 MIB.addUse(RSrc) // rsrc 3071 .addUse(VIndex) // vindex 3072 .addUse(VOffset) // voffset 3073 .addUse(SOffset) // soffset 3074 .addImm(ImmOffset) // offset(imm) 3075 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3076 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3077 .addMemOperand(MMO); 3078 3079 MI.eraseFromParent(); 3080 return true; 3081 } 3082 3083 // Produce a vector of s16 elements from s32 pieces. 3084 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg, 3085 ArrayRef<Register> UnmergeParts) { 3086 const LLT S16 = LLT::scalar(16); 3087 3088 SmallVector<Register, 4> RemergeParts(UnmergeParts.size()); 3089 for (int I = 0, E = UnmergeParts.size(); I != E; ++I) 3090 RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0); 3091 3092 B.buildBuildVector(DstReg, RemergeParts); 3093 } 3094 3095 /// Convert a set of s32 registers to a result vector with s16 elements. 3096 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg, 3097 ArrayRef<Register> UnmergeParts) { 3098 MachineRegisterInfo &MRI = *B.getMRI(); 3099 const LLT V2S16 = LLT::vector(2, 16); 3100 LLT TargetTy = MRI.getType(DstReg); 3101 int NumElts = UnmergeParts.size(); 3102 3103 if (NumElts == 1) { 3104 assert(TargetTy == V2S16); 3105 B.buildBitcast(DstReg, UnmergeParts[0]); 3106 return; 3107 } 3108 3109 SmallVector<Register, 4> RemergeParts(NumElts); 3110 for (int I = 0; I != NumElts; ++I) 3111 RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0); 3112 3113 if (TargetTy.getSizeInBits() == 32u * NumElts) { 3114 B.buildConcatVectors(DstReg, RemergeParts); 3115 return; 3116 } 3117 3118 const LLT V3S16 = LLT::vector(3, 16); 3119 const LLT V6S16 = LLT::vector(6, 16); 3120 3121 // Widen to v6s16 and unpack v3 parts. 3122 assert(TargetTy == V3S16); 3123 3124 RemergeParts.push_back(B.buildUndef(V2S16).getReg(0)); 3125 auto Concat = B.buildConcatVectors(V6S16, RemergeParts); 3126 B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat); 3127 } 3128 3129 // FIXME: Just vector trunc should be sufficent, but legalization currently 3130 // broken. 
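// e.g. on subtargets with unpacked d16 memory operations a <4 x s16> load
// returns four dwords; unmerge the s32 pieces, truncate each to s16, and
// rebuild the <4 x s16> with a build_vector.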
3131 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg, 3132 Register WideDstReg) { 3133 const LLT S32 = LLT::scalar(32); 3134 const LLT S16 = LLT::scalar(16); 3135 3136 auto Unmerge = B.buildUnmerge(S32, WideDstReg); 3137 3138 int NumOps = Unmerge->getNumOperands() - 1; 3139 SmallVector<Register, 4> RemergeParts(NumOps); 3140 for (int I = 0; I != NumOps; ++I) 3141 RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0); 3142 3143 B.buildBuildVector(DstReg, RemergeParts); 3144 } 3145 3146 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3147 MachineInstr &MI, MachineIRBuilder &B, 3148 GISelChangeObserver &Observer, 3149 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3150 bool IsTFE = MI.getNumExplicitDefs() == 2; 3151 3152 // We are only processing the operands of d16 image operations on subtargets 3153 // that use the unpacked register layout, or need to repack the TFE result. 3154 3155 // TODO: Need to handle a16 images too 3156 // TODO: Do we need to guard against already legalized intrinsics? 3157 if (!IsTFE && !ST.hasUnpackedD16VMem()) 3158 return true; 3159 3160 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3161 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3162 3163 if (BaseOpcode->Atomic) // No d16 atomics, or TFE. 3164 return true; 3165 3166 B.setInstr(MI); 3167 3168 MachineRegisterInfo *MRI = B.getMRI(); 3169 const LLT S32 = LLT::scalar(32); 3170 const LLT S16 = LLT::scalar(16); 3171 3172 if (BaseOpcode->Store) { // No TFE for stores? 3173 Register VData = MI.getOperand(1).getReg(); 3174 LLT Ty = MRI->getType(VData); 3175 if (!Ty.isVector() || Ty.getElementType() != S16) 3176 return true; 3177 3178 B.setInstr(MI); 3179 3180 Observer.changingInstr(MI); 3181 MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData)); 3182 Observer.changedInstr(MI); 3183 return true; 3184 } 3185 3186 Register DstReg = MI.getOperand(0).getReg(); 3187 LLT Ty = MRI->getType(DstReg); 3188 const LLT EltTy = Ty.getScalarType(); 3189 const bool IsD16 = Ty.getScalarType() == S16; 3190 const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3191 3192 if (IsTFE) { 3193 // In the IR, TFE is supposed to be used with a 2 element struct return 3194 // type. The intruction really returns these two values in one contiguous 3195 // register, with one additional dword beyond the loaded data. Rewrite the 3196 // return type to use a single register result. 3197 Register Dst1Reg = MI.getOperand(1).getReg(); 3198 if (MRI->getType(Dst1Reg) != S32) 3199 return false; 3200 3201 // TODO: Make sure the TFE operand bit is set. 3202 3203 // The raw dword aligned data component of the load. The only legal cases 3204 // where this matters should be when using the packed D16 format, for 3205 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3206 LLT RoundedTy; 3207 LLT TFETy; 3208 3209 if (IsD16 && ST.hasUnpackedD16VMem()) { 3210 RoundedTy = LLT::scalarOrVector(NumElts, 32); 3211 TFETy = LLT::vector(NumElts + 1, 32); 3212 } else { 3213 unsigned EltSize = Ty.getScalarSizeInBits(); 3214 unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32; 3215 unsigned RoundedSize = 32 * RoundedElts; 3216 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3217 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3218 } 3219 3220 Register TFEReg = MRI->createGenericVirtualRegister(TFETy); 3221 Observer.changingInstr(MI); 3222 3223 MI.getOperand(0).setReg(TFEReg); 3224 MI.RemoveOperand(1); 3225 3226 Observer.changedInstr(MI); 3227 3228 // Insert after the instruction. 
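// The unmerge/repack sequence below consumes the rewritten TFE result, so it
// must be emitted after the image instruction itself.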
3229 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3230 3231 // Now figure out how to copy the new result register back into the old 3232 // result. 3233 3234 SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg); 3235 int NumDataElts = TFETy.getNumElements() - 1; 3236 3237 if (!Ty.isVector()) { 3238 // Simplest case is a trivial unmerge (plus a truncate for d16). 3239 UnmergeResults[0] = Ty == S32 ? 3240 DstReg : MRI->createGenericVirtualRegister(S32); 3241 3242 B.buildUnmerge(UnmergeResults, TFEReg); 3243 if (Ty != S32) 3244 B.buildTrunc(DstReg, UnmergeResults[0]); 3245 return true; 3246 } 3247 3248 // We have to repack into a new vector of some kind. 3249 for (int I = 0; I != NumDataElts; ++I) 3250 UnmergeResults[I] = MRI->createGenericVirtualRegister(S32); 3251 B.buildUnmerge(UnmergeResults, TFEReg); 3252 3253 // Drop the final TFE element. 3254 ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts); 3255 3256 if (EltTy == S32) 3257 B.buildBuildVector(DstReg, DataPart); 3258 else if (ST.hasUnpackedD16VMem()) 3259 truncToS16Vector(B, DstReg, DataPart); 3260 else 3261 bitcastToS16Vector(B, DstReg, DataPart); 3262 3263 return true; 3264 } 3265 3266 // Must be an image load. 3267 if (!Ty.isVector() || Ty.getElementType() != S16) 3268 return true; 3269 3270 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3271 3272 LLT WidenedTy = Ty.changeElementType(S32); 3273 Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy); 3274 3275 Observer.changingInstr(MI); 3276 MI.getOperand(0).setReg(WideDstReg); 3277 Observer.changedInstr(MI); 3278 3279 repackUnpackedD16Load(B, DstReg, WideDstReg); 3280 return true; 3281 } 3282 3283 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 3284 MachineInstr &MI, MachineIRBuilder &B, 3285 GISelChangeObserver &Observer) const { 3286 Register Dst = MI.getOperand(0).getReg(); 3287 LLT Ty = B.getMRI()->getType(Dst); 3288 unsigned Size = Ty.getSizeInBits(); 3289 MachineFunction &MF = B.getMF(); 3290 3291 Observer.changingInstr(MI); 3292 3293 // FIXME: We don't really need this intermediate instruction. The intrinsic 3294 // should be fixed to have a memory operand. Since it's readnone, we're not 3295 // allowed to add one. 3296 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 3297 MI.RemoveOperand(1); // Remove intrinsic ID 3298 3299 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 3300 // TODO: Should this use datalayout alignment? 3301 const unsigned MemSize = (Size + 7) / 8; 3302 const unsigned MemAlign = 4; 3303 MachineMemOperand *MMO = MF.getMachineMemOperand( 3304 MachinePointerInfo(), 3305 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 3306 MachineMemOperand::MOInvariant, MemSize, MemAlign); 3307 MI.addMemOperand(MF, MMO); 3308 3309 // There are no 96-bit result scalar loads, but widening to 128-bit should 3310 // always be legal. We may need to restore this to a 96-bit result if it turns 3311 // out this needs to be converted to a vector load during RegBankSelect. 
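// e.g. a <3 x s32> (96-bit) result is padded to <4 x s32> here, and a
// non-power-of-2 scalar such as s96 is widened to s128.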
3312 if (!isPowerOf2_32(Size)) { 3313 LegalizerHelper Helper(MF, *this, Observer, B); 3314 B.setInstr(MI); 3315 3316 if (Ty.isVector()) 3317 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 3318 else 3319 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 3320 } 3321 3322 Observer.changedInstr(MI); 3323 return true; 3324 } 3325 3326 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 3327 MachineIRBuilder &B, 3328 GISelChangeObserver &Observer) const { 3329 MachineRegisterInfo &MRI = *B.getMRI(); 3330 3331 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 3332 auto IntrID = MI.getIntrinsicID(); 3333 switch (IntrID) { 3334 case Intrinsic::amdgcn_if: 3335 case Intrinsic::amdgcn_else: { 3336 MachineInstr *Br = nullptr; 3337 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3338 const SIRegisterInfo *TRI 3339 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3340 3341 B.setInstr(*BrCond); 3342 Register Def = MI.getOperand(1).getReg(); 3343 Register Use = MI.getOperand(3).getReg(); 3344 3345 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 3346 if (Br) 3347 BrTarget = Br->getOperand(0).getMBB(); 3348 3349 if (IntrID == Intrinsic::amdgcn_if) { 3350 B.buildInstr(AMDGPU::SI_IF) 3351 .addDef(Def) 3352 .addUse(Use) 3353 .addMBB(BrTarget); 3354 } else { 3355 B.buildInstr(AMDGPU::SI_ELSE) 3356 .addDef(Def) 3357 .addUse(Use) 3358 .addMBB(BrTarget) 3359 .addImm(0); 3360 } 3361 3362 if (Br) 3363 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 3364 3365 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 3366 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 3367 MI.eraseFromParent(); 3368 BrCond->eraseFromParent(); 3369 return true; 3370 } 3371 3372 return false; 3373 } 3374 case Intrinsic::amdgcn_loop: { 3375 MachineInstr *Br = nullptr; 3376 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3377 const SIRegisterInfo *TRI 3378 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3379 3380 B.setInstr(*BrCond); 3381 3382 // FIXME: Need to adjust branch targets based on unconditional branch. 
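// The loop intrinsic's boolean result feeds the G_BRCOND; replace the pair
// with SI_LOOP, which takes the loop mask register and the G_BRCOND's target
// block, then erase both original instructions.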
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}