//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

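// Break a wide vector into pieces that are at most 64 bits wide; for example
// a <4 x s32> (two 64-bit pieces) is mutated to <2 x s32>.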
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
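// For example, v3s32, v2s64 and v4s16 are register types, while v3s16 and s48
// are not.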
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

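  // fneg/fabs only touch the sign bit and are typically folded into source
  // modifiers, so they are cheap for every FP type including packed f16.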
  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
    .scalarize(0)
    .alwaysLegal();

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return IsLoad ? 512 : 128;
    default:
      return 128;
    }
  };

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
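    // Each entry below is {result type, pointer type, memory size in bits,
    // minimum alignment in bits}; an alignment requirement of 0 (the
    // unaligned-access subtargets) accepts any alignment.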
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc(
      {{S32, GlobalPtr, 32, GlobalAlign32},
       {V2S32, GlobalPtr, 64, GlobalAlign32},
       {V3S32, GlobalPtr, 96, GlobalAlign32},
       {S96, GlobalPtr, 96, GlobalAlign32},
       {V4S32, GlobalPtr, 128, GlobalAlign32},
       {S128, GlobalPtr, 128, GlobalAlign32},
       {S64, GlobalPtr, 64, GlobalAlign32},
       {V2S64, GlobalPtr, 128, GlobalAlign32},
       {V2S16, GlobalPtr, 32, GlobalAlign32},
       {S32, GlobalPtr, 8, GlobalAlign8},
       {S32, GlobalPtr, 16, GlobalAlign16},

       {S32, LocalPtr, 32, 32},
       {S64, LocalPtr, 64, 32},
       {V2S32, LocalPtr, 64, 32},
       {S32, LocalPtr, 8, 8},
       {S32, LocalPtr, 16, 16},
       {V2S16, LocalPtr, 32, 32},

       {S32, PrivatePtr, 32, 32},
       {S32, PrivatePtr, 8, 8},
       {S32, PrivatePtr, 16, 16},
       {V2S16, PrivatePtr, 32, 32},

       {S32, FlatPtr, 32, GlobalAlign32},
       {S32, FlatPtr, 16, GlobalAlign16},
       {S32, FlatPtr, 8, GlobalAlign8},
       {V2S16, FlatPtr, 32, GlobalAlign32},

       {S32, ConstantPtr, 32, GlobalAlign32},
       {V2S32, ConstantPtr, 64, GlobalAlign32},
       {V3S32, ConstantPtr, 96, GlobalAlign32},
       {V4S32, ConstantPtr, 128, GlobalAlign32},
       {S64, ConstantPtr, 64, GlobalAlign32},
       {S128, ConstantPtr, 128, GlobalAlign32},
       {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
      .customIf(typeIs(1, Constant32Ptr))
      .narrowScalarIf(
        [=](const LegalityQuery &Query) -> bool {
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          const unsigned DstSize = DstTy.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;

          // Split extloads.
          if (DstSize > MemSize)
            return std::make_pair(0, LLT::scalar(MemSize));

          if (DstSize > 32 && (DstSize % 32 != 0)) {
            // FIXME: Need a way to specify non-extload of larger size if
            // suitably aligned.
            return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
          }

          unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);
          if (MemSize > MaxSize)
            return std::make_pair(0, LLT::scalar(MaxSize));

          unsigned Align = Query.MMODescrs[0].AlignInBits;
          return std::make_pair(0, LLT::scalar(Align));
        })
      .fewerElementsIf(
        [=](const LegalityQuery &Query) -> bool {
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          LLT EltTy = DstTy.getElementType();
          unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);

          // Split if it's too large for the address space.
          if (Query.MMODescrs[0].SizeInBits > MaxSize) {
            unsigned NumElts = DstTy.getNumElements();
            unsigned EltSize = EltTy.getSizeInBits();

            if (MaxSize % EltSize == 0) {
              return std::make_pair(
                0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
            }

            unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

            // FIXME: Refine when odd breakdowns handled
            // The scalars will need to be re-legalized.
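            // For example a <2 x s64> private access (a 128-bit access with a
            // 32-bit limit) would want 4 pieces for only 2 elements, so it is
            // scalarized to s64 here and re-legalized afterwards.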
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::make_pair(0, EltTy);

            return std::make_pair(0,
                                  LLT::vector(NumElts / NumPieces, EltTy));
          }

          // Need to split because of alignment.
          unsigned Align = Query.MMODescrs[0].AlignInBits;
          unsigned EltSize = EltTy.getSizeInBits();
          if (EltSize > Align &&
              (EltSize / Align < DstTy.getNumElements())) {
            return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
          }

          // May need relegalization for the scalars.
          return std::make_pair(0, EltTy);
        })
      .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
      .legalIf([=](const LegalityQuery &Query) {
        const LLT Ty0 = Query.Types[0];
        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        unsigned Align = Query.MMODescrs[0].AlignInBits;

        // FIXME: Widening store from alignment not valid.
        if (MemSize < Size)
          MemSize = std::max(MemSize, Align);

        // No extending vector loads.
        if (Size > MemSize && Ty0.isVector())
          return false;

        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;
        case 96:
          return ST.hasDwordx3LoadStores();
        case 256:
        case 512:
          return true;
        default:
          return false;
        }
      })
      .widenScalarToNextPow2(0)
      // TODO: v3s32->v4s32 with alignment
      .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc(
      {{S32, GlobalPtr, 8, 8},
       {S32, GlobalPtr, 16, 2 * 8},
       {S32, LocalPtr, 8, 8},
       {S32, LocalPtr, 16, 16},
       {S32, PrivatePtr, 8, 8},
       {S32, PrivatePtr, 16, 16},
       {S32, ConstantPtr, 8, 8},
       {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
      {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
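  // (Uniform selects keep the condition in a 32-bit SGPR copied from SCC;
  // divergent selects take an s1 condition in the VCC bank, mirroring the
  // G_ICMP results above.)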
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    // TODO: Support 16-bit shift amounts
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)

      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .lowerFor({{S16, V2S16}})
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

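  // The aperture base is either read directly from a hardware register (the
  // s_getreg path below) or, on older subtargets, loaded from the aperture
  // fields of the queue descriptor reached through the queue pointer.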
  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
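  // Adding and then subtracting 2^52 (with the sign of Src copied onto it)
  // forces the fraction to be rounded off in double precision, leaving the
  // nearest integer. Values with |Src| > 0x1.fffffffffffffp+51 are already
  // integral, so the original input is selected for those below.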
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

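  // Shifting the fraction mask right by the unbiased exponent leaves set bits
  // only below the binary point; clearing those bits in Src truncates toward
  // zero. An exponent below 0 means |Src| < 1, so the result is a signed zero,
  // and an exponent above 51 means Src is already an integer.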
  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));

  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);

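  // K0 = 2^-32 and K1 = -2^32: FloorMul is the high 32-bit word of the
  // truncated value as a double, and the fma computes Trunc - FloorMul * 2^32,
  // i.e. the remaining low word. The two halves are converted separately and
  // re-merged below.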
  auto Hi = Signed ?
    B.buildFPTOSI(S32, FloorMul) :
    B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  B.buildMerge(Dst, { Lo, Hi });
  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
    MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal->Value < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
    MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal->Value < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
  assert(Mask.size() == 2);

  // If one half is undef, the other is trivially in the same reg.
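  // Otherwise both output lanes must read from the same source vector: indices
  // {0,1} select the halves of the first operand and {2,3} the second.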
1736 if (Mask[0] == -1 || Mask[1] == -1) 1737 return true; 1738 return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) || 1739 ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3)); 1740 } 1741 1742 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1743 MachineInstr &MI, MachineRegisterInfo &MRI, 1744 MachineIRBuilder &B) const { 1745 const LLT V2S16 = LLT::vector(2, 16); 1746 1747 Register Dst = MI.getOperand(0).getReg(); 1748 Register Src0 = MI.getOperand(1).getReg(); 1749 LLT DstTy = MRI.getType(Dst); 1750 LLT SrcTy = MRI.getType(Src0); 1751 1752 if (SrcTy == V2S16 && DstTy == V2S16 && 1753 isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1754 return true; 1755 1756 MachineIRBuilder HelperBuilder(MI); 1757 GISelObserverWrapper DummyObserver; 1758 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1759 HelperBuilder.setInstr(MI); 1760 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1761 } 1762 1763 bool AMDGPULegalizerInfo::legalizeSinCos( 1764 MachineInstr &MI, MachineRegisterInfo &MRI, 1765 MachineIRBuilder &B) const { 1766 B.setInstr(MI); 1767 1768 Register DstReg = MI.getOperand(0).getReg(); 1769 Register SrcReg = MI.getOperand(1).getReg(); 1770 LLT Ty = MRI.getType(DstReg); 1771 unsigned Flags = MI.getFlags(); 1772 1773 Register TrigVal; 1774 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1775 if (ST.hasTrigReducedRange()) { 1776 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1777 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1778 .addUse(MulVal.getReg(0)) 1779 .setMIFlags(Flags).getReg(0); 1780 } else 1781 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1782 1783 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1784 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1785 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1786 .addUse(TrigVal) 1787 .setMIFlags(Flags); 1788 MI.eraseFromParent(); 1789 return true; 1790 } 1791 1792 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1793 Register DstReg, LLT PtrTy, 1794 MachineIRBuilder &B, const GlobalValue *GV, 1795 unsigned Offset, unsigned GAFlags) const { 1796 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1797 // to the following code sequence: 1798 // 1799 // For constant address space: 1800 // s_getpc_b64 s[0:1] 1801 // s_add_u32 s0, s0, $symbol 1802 // s_addc_u32 s1, s1, 0 1803 // 1804 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1805 // a fixup or relocation is emitted to replace $symbol with a literal 1806 // constant, which is a pc-relative offset from the encoding of the $symbol 1807 // operand to the global variable. 1808 // 1809 // For global address space: 1810 // s_getpc_b64 s[0:1] 1811 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1812 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1813 // 1814 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1815 // fixups or relocations are emitted to replace $symbol@*@lo and 1816 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1817 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1818 // operand to the global variable. 
1819 // 1820 // What we want here is an offset from the value returned by s_getpc 1821 // (which is the address of the s_add_u32 instruction) to the global 1822 // variable, but since the encoding of $symbol starts 4 bytes after the start 1823 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1824 // small. This requires us to add 4 to the global variable offset in order to 1825 // compute the correct address. 1826 1827 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1828 1829 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1830 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1831 1832 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1833 .addDef(PCReg); 1834 1835 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1836 if (GAFlags == SIInstrInfo::MO_NONE) 1837 MIB.addImm(0); 1838 else 1839 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1840 1841 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1842 1843 if (PtrTy.getSizeInBits() == 32) 1844 B.buildExtract(DstReg, PCReg, 0); 1845 return true; 1846 } 1847 1848 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1849 MachineInstr &MI, MachineRegisterInfo &MRI, 1850 MachineIRBuilder &B) const { 1851 Register DstReg = MI.getOperand(0).getReg(); 1852 LLT Ty = MRI.getType(DstReg); 1853 unsigned AS = Ty.getAddressSpace(); 1854 1855 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1856 MachineFunction &MF = B.getMF(); 1857 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1858 B.setInstr(MI); 1859 1860 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1861 if (!MFI->isEntryFunction()) { 1862 const Function &Fn = MF.getFunction(); 1863 DiagnosticInfoUnsupported BadLDSDecl( 1864 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1865 Fn.getContext().diagnose(BadLDSDecl); 1866 } 1867 1868 // TODO: We could emit code to handle the initialization somewhere. 1869 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1870 const SITargetLowering *TLI = ST.getTargetLowering(); 1871 if (!TLI->shouldUseLDSConstAddress(GV)) { 1872 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 1873 return true; // Leave in place. 1874 } 1875 1876 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 1877 MI.eraseFromParent(); 1878 return true; 1879 } 1880 1881 const Function &Fn = MF.getFunction(); 1882 DiagnosticInfoUnsupported BadInit( 1883 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 1884 Fn.getContext().diagnose(BadInit); 1885 return true; 1886 } 1887 1888 const SITargetLowering *TLI = ST.getTargetLowering(); 1889 1890 if (TLI->shouldEmitFixup(GV)) { 1891 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 1892 MI.eraseFromParent(); 1893 return true; 1894 } 1895 1896 if (TLI->shouldEmitPCReloc(GV)) { 1897 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 1898 MI.eraseFromParent(); 1899 return true; 1900 } 1901 1902 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1903 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 1904 1905 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 1906 MachinePointerInfo::getGOT(MF), 1907 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1908 MachineMemOperand::MOInvariant, 1909 8 /*Size*/, 8 /*Align*/); 1910 1911 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 1912 1913 if (Ty.getSizeInBits() == 32) { 1914 // Truncate if this is a 32-bit constant address.
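// The GOT entry itself is a 64-bit constant pointer, so load it at full width and then extract the low 32 bits.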
1915 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 1916 B.buildExtract(DstReg, Load, 0); 1917 } else 1918 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 1919 1920 MI.eraseFromParent(); 1921 return true; 1922 } 1923 1924 bool AMDGPULegalizerInfo::legalizeLoad( 1925 MachineInstr &MI, MachineRegisterInfo &MRI, 1926 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 1927 B.setInstr(MI); 1928 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1929 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 1930 Observer.changingInstr(MI); 1931 MI.getOperand(1).setReg(Cast.getReg(0)); 1932 Observer.changedInstr(MI); 1933 return true; 1934 } 1935 1936 bool AMDGPULegalizerInfo::legalizeFMad( 1937 MachineInstr &MI, MachineRegisterInfo &MRI, 1938 MachineIRBuilder &B) const { 1939 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 1940 assert(Ty.isScalar()); 1941 1942 MachineFunction &MF = B.getMF(); 1943 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1944 1945 // TODO: Always legal with future ftz flag. 1946 // FIXME: Do we need just output? 1947 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 1948 return true; 1949 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 1950 return true; 1951 1952 MachineIRBuilder HelperBuilder(MI); 1953 GISelObserverWrapper DummyObserver; 1954 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1955 HelperBuilder.setMBB(*MI.getParent()); 1956 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 1957 } 1958 1959 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 1960 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 1961 Register DstReg = MI.getOperand(0).getReg(); 1962 Register PtrReg = MI.getOperand(1).getReg(); 1963 Register CmpVal = MI.getOperand(2).getReg(); 1964 Register NewVal = MI.getOperand(3).getReg(); 1965 1966 assert(SITargetLowering::isFlatGlobalAddrSpace( 1967 MRI.getType(PtrReg).getAddressSpace()) && 1968 "this should not have been custom lowered"); 1969 1970 LLT ValTy = MRI.getType(CmpVal); 1971 LLT VecTy = LLT::vector(2, ValTy); 1972 1973 B.setInstr(MI); 1974 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 1975 1976 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 1977 .addDef(DstReg) 1978 .addUse(PtrReg) 1979 .addUse(PackedVal) 1980 .setMemRefs(MI.memoperands()); 1981 1982 MI.eraseFromParent(); 1983 return true; 1984 } 1985 1986 bool AMDGPULegalizerInfo::legalizeFlog( 1987 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 1988 Register Dst = MI.getOperand(0).getReg(); 1989 Register Src = MI.getOperand(1).getReg(); 1990 LLT Ty = B.getMRI()->getType(Dst); 1991 unsigned Flags = MI.getFlags(); 1992 B.setInstr(MI); 1993 1994 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 1995 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 1996 1997 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 1998 MI.eraseFromParent(); 1999 return true; 2000 } 2001 2002 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2003 MachineIRBuilder &B) const { 2004 Register Dst = MI.getOperand(0).getReg(); 2005 Register Src = MI.getOperand(1).getReg(); 2006 unsigned Flags = MI.getFlags(); 2007 LLT Ty = B.getMRI()->getType(Dst); 2008 B.setInstr(MI); 2009 2010 auto K = B.buildFConstant(Ty, numbers::log2e); 2011 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2012 B.buildFExp2(Dst, Mul, Flags); 2013 MI.eraseFromParent(); 2014 return true; 2015 } 2016 2017 // Find a source register, ignoring 
any possible source modifiers. 2018 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2019 Register ModSrc = OrigSrc; 2020 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2021 ModSrc = SrcFNeg->getOperand(1).getReg(); 2022 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2023 ModSrc = SrcFAbs->getOperand(1).getReg(); 2024 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2025 ModSrc = SrcFAbs->getOperand(1).getReg(); 2026 return ModSrc; 2027 } 2028 2029 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2030 MachineRegisterInfo &MRI, 2031 MachineIRBuilder &B) const { 2032 B.setInstr(MI); 2033 2034 const LLT S1 = LLT::scalar(1); 2035 const LLT S64 = LLT::scalar(64); 2036 Register Dst = MI.getOperand(0).getReg(); 2037 Register OrigSrc = MI.getOperand(1).getReg(); 2038 unsigned Flags = MI.getFlags(); 2039 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2040 "this should not have been custom lowered"); 2041 2042 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2043 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2044 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2045 // V_FRACT bug is: 2046 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2047 // 2048 // Convert floor(x) to (x - fract(x)) 2049 2050 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2051 .addUse(OrigSrc) 2052 .setMIFlags(Flags); 2053 2054 // Give source modifier matching some assistance before obscuring a foldable 2055 // pattern. 2056 2057 // TODO: We can avoid the neg on the fract? The input sign to fract 2058 // shouldn't matter? 2059 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2060 2061 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2062 2063 Register Min = MRI.createGenericVirtualRegister(S64); 2064 2065 // We don't need to concern ourselves with the snan handling difference, so 2066 // use the one which will directly select. 2067 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2068 if (MFI->getMode().IEEE) 2069 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2070 else 2071 B.buildFMinNum(Min, Fract, Const, Flags); 2072 2073 Register CorrectedFract = Min; 2074 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2075 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2076 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2077 } 2078 2079 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2080 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2081 2082 MI.eraseFromParent(); 2083 return true; 2084 } 2085 2086 // Turn an illegal packed v2s16 build vector into bit operations. 2087 // TODO: This should probably be a bitcast action in LegalizerHelper. 
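// For example, %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16) is rewritten as an s32 G_MERGE_VALUES of %a and %b followed by a bitcast of that s32 to <2 x s16>.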
2088 bool AMDGPULegalizerInfo::legalizeBuildVector( 2089 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2090 Register Dst = MI.getOperand(0).getReg(); 2091 LLT DstTy = MRI.getType(Dst); 2092 const LLT S32 = LLT::scalar(32); 2093 const LLT V2S16 = LLT::vector(2, 16); 2094 (void)DstTy; 2095 (void)V2S16; 2096 assert(DstTy == V2S16); 2097 2098 Register Src0 = MI.getOperand(1).getReg(); 2099 Register Src1 = MI.getOperand(2).getReg(); 2100 assert(MRI.getType(Src0) == LLT::scalar(16)); 2101 2102 B.setInstr(MI); 2103 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2104 B.buildBitcast(Dst, Merge); 2105 2106 MI.eraseFromParent(); 2107 return true; 2108 } 2109 2110 // Return the use branch instruction, otherwise null if the usage is invalid. 2111 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2112 MachineRegisterInfo &MRI, 2113 MachineInstr *&Br) { 2114 Register CondDef = MI.getOperand(0).getReg(); 2115 if (!MRI.hasOneNonDBGUse(CondDef)) 2116 return nullptr; 2117 2118 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2119 if (UseMI.getParent() != MI.getParent() || 2120 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2121 return nullptr; 2122 2123 // Make sure the cond br is followed by a G_BR 2124 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2125 if (Next != MI.getParent()->end()) { 2126 if (Next->getOpcode() != AMDGPU::G_BR) 2127 return nullptr; 2128 Br = &*Next; 2129 } 2130 2131 return &UseMI; 2132 } 2133 2134 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, 2135 Register Reg, LLT Ty) const { 2136 Register LiveIn = MRI.getLiveInVirtReg(Reg); 2137 if (LiveIn) 2138 return LiveIn; 2139 2140 Register NewReg = MRI.createGenericVirtualRegister(Ty); 2141 MRI.addLiveIn(Reg, NewReg); 2142 return NewReg; 2143 } 2144 2145 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2146 const ArgDescriptor *Arg) const { 2147 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2148 return false; // TODO: Handle these 2149 2150 assert(Arg->getRegister().isPhysical()); 2151 2152 MachineRegisterInfo &MRI = *B.getMRI(); 2153 2154 LLT Ty = MRI.getType(DstReg); 2155 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); 2156 2157 if (Arg->isMasked()) { 2158 // TODO: Should we try to emit this once in the entry block? 2159 const LLT S32 = LLT::scalar(32); 2160 const unsigned Mask = Arg->getMask(); 2161 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2162 2163 Register AndMaskSrc = LiveIn; 2164 2165 if (Shift != 0) { 2166 auto ShiftAmt = B.buildConstant(S32, Shift); 2167 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2168 } 2169 2170 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2171 } else 2172 B.buildCopy(DstReg, LiveIn); 2173 2174 // Insert the argument copy if it doesn't already exist. 2175 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
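// If the live-in vreg has no def yet, materialize the copy from the physical argument register at the start of the entry block so it dominates all uses.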
2176 if (!MRI.getVRegDef(LiveIn)) { 2177 // FIXME: Should have scoped insert pt 2178 MachineBasicBlock &OrigInsBB = B.getMBB(); 2179 auto OrigInsPt = B.getInsertPt(); 2180 2181 MachineBasicBlock &EntryMBB = B.getMF().front(); 2182 EntryMBB.addLiveIn(Arg->getRegister()); 2183 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2184 B.buildCopy(LiveIn, Arg->getRegister()); 2185 2186 B.setInsertPt(OrigInsBB, OrigInsPt); 2187 } 2188 2189 return true; 2190 } 2191 2192 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2193 MachineInstr &MI, 2194 MachineRegisterInfo &MRI, 2195 MachineIRBuilder &B, 2196 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2197 B.setInstr(MI); 2198 2199 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2200 2201 const ArgDescriptor *Arg; 2202 const TargetRegisterClass *RC; 2203 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2204 if (!Arg) { 2205 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2206 return false; 2207 } 2208 2209 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 2210 MI.eraseFromParent(); 2211 return true; 2212 } 2213 2214 return false; 2215 } 2216 2217 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2218 MachineRegisterInfo &MRI, 2219 MachineIRBuilder &B) const { 2220 B.setInstr(MI); 2221 Register Dst = MI.getOperand(0).getReg(); 2222 LLT DstTy = MRI.getType(Dst); 2223 LLT S16 = LLT::scalar(16); 2224 LLT S32 = LLT::scalar(32); 2225 LLT S64 = LLT::scalar(64); 2226 2227 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2228 return true; 2229 2230 if (DstTy == S16) 2231 return legalizeFDIV16(MI, MRI, B); 2232 if (DstTy == S32) 2233 return legalizeFDIV32(MI, MRI, B); 2234 if (DstTy == S64) 2235 return legalizeFDIV64(MI, MRI, B); 2236 2237 return false; 2238 } 2239 2240 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2241 MachineRegisterInfo &MRI, 2242 MachineIRBuilder &B) const { 2243 Register Res = MI.getOperand(0).getReg(); 2244 Register LHS = MI.getOperand(1).getReg(); 2245 Register RHS = MI.getOperand(2).getReg(); 2246 2247 uint16_t Flags = MI.getFlags(); 2248 2249 LLT ResTy = MRI.getType(Res); 2250 LLT S32 = LLT::scalar(32); 2251 LLT S64 = LLT::scalar(64); 2252 2253 const MachineFunction &MF = B.getMF(); 2254 bool Unsafe = 2255 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2256 2257 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2258 return false; 2259 2260 if (!Unsafe && ResTy == S32 && 2261 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2262 return false; 2263 2264 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2265 // 1 / x -> RCP(x) 2266 if (CLHS->isExactlyValue(1.0)) { 2267 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2268 .addUse(RHS) 2269 .setMIFlags(Flags); 2270 2271 MI.eraseFromParent(); 2272 return true; 2273 } 2274 2275 // -1 / x -> RCP( FNEG(x) ) 2276 if (CLHS->isExactlyValue(-1.0)) { 2277 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2278 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2279 .addUse(FNeg.getReg(0)) 2280 .setMIFlags(Flags); 2281 2282 MI.eraseFromParent(); 2283 return true; 2284 } 2285 } 2286 2287 // x / y -> x * (1.0 / y) 2288 if (Unsafe) { 2289 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2290 .addUse(RHS) 2291 .setMIFlags(Flags); 2292 B.buildFMul(Res, LHS, RCP, Flags); 2293 2294 MI.eraseFromParent(); 2295 return true; 2296 } 2297 2298 return false; 2299 } 2300 2301 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2302 MachineRegisterInfo &MRI, 2303 
MachineIRBuilder &B) const { 2304 B.setInstr(MI); 2305 Register Res = MI.getOperand(0).getReg(); 2306 Register LHS = MI.getOperand(1).getReg(); 2307 Register RHS = MI.getOperand(2).getReg(); 2308 2309 uint16_t Flags = MI.getFlags(); 2310 2311 LLT S16 = LLT::scalar(16); 2312 LLT S32 = LLT::scalar(32); 2313 2314 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2315 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2316 2317 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2318 .addUse(RHSExt.getReg(0)) 2319 .setMIFlags(Flags); 2320 2321 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2322 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2323 2324 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2325 .addUse(RDst.getReg(0)) 2326 .addUse(RHS) 2327 .addUse(LHS) 2328 .setMIFlags(Flags); 2329 2330 MI.eraseFromParent(); 2331 return true; 2332 } 2333 2334 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2335 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2336 static void toggleSPDenormMode(bool Enable, 2337 MachineIRBuilder &B, 2338 const GCNSubtarget &ST, 2339 AMDGPU::SIModeRegisterDefaults Mode) { 2340 // Set SP denorm mode to this value. 2341 unsigned SPDenormMode = 2342 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2343 2344 if (ST.hasDenormModeInst()) { 2345 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2346 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2347 2348 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2349 B.buildInstr(AMDGPU::S_DENORM_MODE) 2350 .addImm(NewDenormModeValue); 2351 2352 } else { 2353 // Select FP32 bit field in mode register. 2354 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2355 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2356 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2357 2358 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2359 .addImm(SPDenormMode) 2360 .addImm(SPDenormModeBitField); 2361 } 2362 } 2363 2364 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2365 MachineRegisterInfo &MRI, 2366 MachineIRBuilder &B) const { 2367 B.setInstr(MI); 2368 Register Res = MI.getOperand(0).getReg(); 2369 Register LHS = MI.getOperand(1).getReg(); 2370 Register RHS = MI.getOperand(2).getReg(); 2371 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2372 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2373 2374 uint16_t Flags = MI.getFlags(); 2375 2376 LLT S32 = LLT::scalar(32); 2377 LLT S1 = LLT::scalar(1); 2378 2379 auto One = B.buildFConstant(S32, 1.0f); 2380 2381 auto DenominatorScaled = 2382 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2383 .addUse(RHS) 2384 .addUse(LHS) 2385 .addImm(1) 2386 .setMIFlags(Flags); 2387 auto NumeratorScaled = 2388 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2389 .addUse(LHS) 2390 .addUse(RHS) 2391 .addImm(0) 2392 .setMIFlags(Flags); 2393 2394 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2395 .addUse(DenominatorScaled.getReg(0)) 2396 .setMIFlags(Flags); 2397 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2398 2399 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2400 // aren't modeled as reading it. 
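// If FP32 denormals are flushed by default, enable them around the reciprocal refinement sequence below and restore the previous mode once it is done.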
2401 if (!Mode.allFP32Denormals()) 2402 toggleSPDenormMode(true, B, ST, Mode); 2403 2404 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2405 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2406 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2407 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2408 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2409 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2410 2411 if (!Mode.allFP32Denormals()) 2412 toggleSPDenormMode(false, B, ST, Mode); 2413 2414 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2415 .addUse(Fma4.getReg(0)) 2416 .addUse(Fma1.getReg(0)) 2417 .addUse(Fma3.getReg(0)) 2418 .addUse(NumeratorScaled.getReg(1)) 2419 .setMIFlags(Flags); 2420 2421 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2422 .addUse(Fmas.getReg(0)) 2423 .addUse(RHS) 2424 .addUse(LHS) 2425 .setMIFlags(Flags); 2426 2427 MI.eraseFromParent(); 2428 return true; 2429 } 2430 2431 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2432 MachineRegisterInfo &MRI, 2433 MachineIRBuilder &B) const { 2434 B.setInstr(MI); 2435 Register Res = MI.getOperand(0).getReg(); 2436 Register LHS = MI.getOperand(1).getReg(); 2437 Register RHS = MI.getOperand(2).getReg(); 2438 2439 uint16_t Flags = MI.getFlags(); 2440 2441 LLT S64 = LLT::scalar(64); 2442 LLT S1 = LLT::scalar(1); 2443 2444 auto One = B.buildFConstant(S64, 1.0); 2445 2446 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2447 .addUse(LHS) 2448 .addUse(RHS) 2449 .addImm(1) 2450 .setMIFlags(Flags); 2451 2452 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2453 2454 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2455 .addUse(DivScale0.getReg(0)) 2456 .setMIFlags(Flags); 2457 2458 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 2459 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 2460 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 2461 2462 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2463 .addUse(LHS) 2464 .addUse(RHS) 2465 .addImm(0) 2466 .setMIFlags(Flags); 2467 2468 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 2469 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 2470 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 2471 2472 Register Scale; 2473 if (!ST.hasUsableDivScaleConditionOutput()) { 2474 // Workaround a hardware bug on SI where the condition output from div_scale 2475 // is not usable.
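// Emulate the missing condition bit by comparing the high halves of the original operands against the corresponding div_scale results.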
2476 2477 LLT S32 = LLT::scalar(32); 2478 2479 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2480 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2481 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2482 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2483 2484 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2485 Scale1Unmerge.getReg(1)); 2486 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2487 Scale0Unmerge.getReg(1)); 2488 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2489 } else { 2490 Scale = DivScale1.getReg(1); 2491 } 2492 2493 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2494 .addUse(Fma4.getReg(0)) 2495 .addUse(Fma3.getReg(0)) 2496 .addUse(Mul.getReg(0)) 2497 .addUse(Scale) 2498 .setMIFlags(Flags); 2499 2500 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2501 .addUse(Fmas.getReg(0)) 2502 .addUse(RHS) 2503 .addUse(LHS) 2504 .setMIFlags(Flags); 2505 2506 MI.eraseFromParent(); 2507 return true; 2508 } 2509 2510 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2511 MachineRegisterInfo &MRI, 2512 MachineIRBuilder &B) const { 2513 B.setInstr(MI); 2514 Register Res = MI.getOperand(0).getReg(); 2515 Register LHS = MI.getOperand(2).getReg(); 2516 Register RHS = MI.getOperand(3).getReg(); 2517 uint16_t Flags = MI.getFlags(); 2518 2519 LLT S32 = LLT::scalar(32); 2520 LLT S1 = LLT::scalar(1); 2521 2522 auto Abs = B.buildFAbs(S32, RHS, Flags); 2523 const APFloat C0Val(1.0f); 2524 2525 auto C0 = B.buildConstant(S32, 0x6f800000); 2526 auto C1 = B.buildConstant(S32, 0x2f800000); 2527 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2528 2529 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2530 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2531 2532 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2533 2534 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2535 .addUse(Mul0.getReg(0)) 2536 .setMIFlags(Flags); 2537 2538 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2539 2540 B.buildFMul(Res, Sel, Mul1, Flags); 2541 2542 MI.eraseFromParent(); 2543 return true; 2544 } 2545 2546 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2547 MachineRegisterInfo &MRI, 2548 MachineIRBuilder &B) const { 2549 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2550 if (!MFI->isEntryFunction()) { 2551 return legalizePreloadedArgIntrin(MI, MRI, B, 2552 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2553 } 2554 2555 B.setInstr(MI); 2556 2557 uint64_t Offset = 2558 ST.getTargetLowering()->getImplicitParameterOffset( 2559 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2560 Register DstReg = MI.getOperand(0).getReg(); 2561 LLT DstTy = MRI.getType(DstReg); 2562 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2563 2564 const ArgDescriptor *Arg; 2565 const TargetRegisterClass *RC; 2566 std::tie(Arg, RC) 2567 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2568 if (!Arg) 2569 return false; 2570 2571 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2572 if (!loadInputValue(KernargPtrReg, B, Arg)) 2573 return false; 2574 2575 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2576 MI.eraseFromParent(); 2577 return true; 2578 } 2579 2580 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2581 MachineRegisterInfo &MRI, 2582 MachineIRBuilder &B, 2583 unsigned AddrSpace) const { 2584 B.setInstr(MI); 2585 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2586 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2587 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2588 MI.eraseFromParent(); 2589 return true; 2590 } 2591 2592 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2593 // offset (the offset that is included in bounds checking and swizzling, to be 2594 // split between the instruction's voffset and immoffset fields) and soffset 2595 // (the offset that is excluded from bounds checking and swizzling, to go in 2596 // the instruction's soffset field). This function takes the first kind of 2597 // offset and figures out how to split it between voffset and immoffset. 2598 std::tuple<Register, unsigned, unsigned> 2599 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2600 Register OrigOffset) const { 2601 const unsigned MaxImm = 4095; 2602 Register BaseReg; 2603 unsigned TotalConstOffset; 2604 MachineInstr *OffsetDef; 2605 const LLT S32 = LLT::scalar(32); 2606 2607 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2608 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2609 2610 unsigned ImmOffset = TotalConstOffset; 2611 2612 // If the immediate value is too big for the immoffset field, put the value 2613 // and -4096 into the immoffset field so that the value that is copied/added 2614 // for the voffset field is a multiple of 4096, and it stands more chance 2615 // of being CSEd with the copy/add for another similar load/store. 2616 // However, do not do that rounding down to a multiple of 4096 if that is a 2617 // negative number, as it appears to be illegal to have a negative offset 2618 // in the vgpr, even if adding the immediate offset makes it positive. 2619 unsigned Overflow = ImmOffset & ~MaxImm; 2620 ImmOffset -= Overflow; 2621 if ((int32_t)Overflow < 0) { 2622 Overflow += ImmOffset; 2623 ImmOffset = 0; 2624 } 2625 2626 if (Overflow != 0) { 2627 if (!BaseReg) { 2628 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2629 } else { 2630 auto OverflowVal = B.buildConstant(S32, Overflow); 2631 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2632 } 2633 } 2634 2635 if (!BaseReg) 2636 BaseReg = B.buildConstant(S32, 0).getReg(0); 2637 2638 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2639 } 2640 2641 /// Handle register layout difference for f16 images for some subtargets. 2642 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2643 MachineRegisterInfo &MRI, 2644 Register Reg) const { 2645 if (!ST.hasUnpackedD16VMem()) 2646 return Reg; 2647 2648 const LLT S16 = LLT::scalar(16); 2649 const LLT S32 = LLT::scalar(32); 2650 LLT StoreVT = MRI.getType(Reg); 2651 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2652 2653 auto Unmerge = B.buildUnmerge(S16, Reg); 2654 2655 SmallVector<Register, 4> WideRegs; 2656 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2657 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2658 2659 int NumElts = StoreVT.getNumElements(); 2660 2661 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2662 } 2663 2664 Register AMDGPULegalizerInfo::fixStoreSourceType( 2665 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2666 MachineRegisterInfo *MRI = B.getMRI(); 2667 LLT Ty = MRI->getType(VData); 2668 2669 const LLT S16 = LLT::scalar(16); 2670 2671 // Fixup illegal register types for i8 stores. 
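// For example, a buffer store of an s8 or s16 value is widened first: %wide:_(s32) = G_ANYEXT %val:_(s8).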
2672 if (Ty == LLT::scalar(8) || Ty == S16) { 2673 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2674 return AnyExt; 2675 } 2676 2677 if (Ty.isVector()) { 2678 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2679 if (IsFormat) 2680 return handleD16VData(B, *MRI, VData); 2681 } 2682 } 2683 2684 return VData; 2685 } 2686 2687 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2688 MachineRegisterInfo &MRI, 2689 MachineIRBuilder &B, 2690 bool IsTyped, 2691 bool IsFormat) const { 2692 B.setInstr(MI); 2693 2694 Register VData = MI.getOperand(1).getReg(); 2695 LLT Ty = MRI.getType(VData); 2696 LLT EltTy = Ty.getScalarType(); 2697 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2698 const LLT S32 = LLT::scalar(32); 2699 2700 VData = fixStoreSourceType(B, VData, IsFormat); 2701 Register RSrc = MI.getOperand(2).getReg(); 2702 2703 MachineMemOperand *MMO = *MI.memoperands_begin(); 2704 const int MemSize = MMO->getSize(); 2705 2706 unsigned ImmOffset; 2707 unsigned TotalOffset; 2708 2709 // The typed intrinsics add an immediate after the registers. 2710 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2711 2712 // The struct intrinsic variants add one additional operand over raw. 2713 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2714 Register VIndex; 2715 int OpOffset = 0; 2716 if (HasVIndex) { 2717 VIndex = MI.getOperand(3).getReg(); 2718 OpOffset = 1; 2719 } 2720 2721 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2722 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2723 2724 unsigned Format = 0; 2725 if (IsTyped) { 2726 Format = MI.getOperand(5 + OpOffset).getImm(); 2727 ++OpOffset; 2728 } 2729 2730 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2731 2732 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2733 if (TotalOffset != 0) 2734 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2735 2736 unsigned Opc; 2737 if (IsTyped) { 2738 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 2739 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 2740 } else if (IsFormat) { 2741 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 2742 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 2743 } else { 2744 switch (MemSize) { 2745 case 1: 2746 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 2747 break; 2748 case 2: 2749 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 2750 break; 2751 default: 2752 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 2753 break; 2754 } 2755 } 2756 2757 if (!VIndex) 2758 VIndex = B.buildConstant(S32, 0).getReg(0); 2759 2760 auto MIB = B.buildInstr(Opc) 2761 .addUse(VData) // vdata 2762 .addUse(RSrc) // rsrc 2763 .addUse(VIndex) // vindex 2764 .addUse(VOffset) // voffset 2765 .addUse(SOffset) // soffset 2766 .addImm(ImmOffset); // offset(imm) 2767 2768 if (IsTyped) 2769 MIB.addImm(Format); 2770 2771 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2772 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2773 .addMemOperand(MMO); 2774 2775 MI.eraseFromParent(); 2776 return true; 2777 } 2778 2779 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 2780 MachineRegisterInfo &MRI, 2781 MachineIRBuilder &B, 2782 bool IsFormat, 2783 bool IsTyped) const { 2784 B.setInstr(MI); 2785 2786 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
2787 MachineMemOperand *MMO = *MI.memoperands_begin(); 2788 const int MemSize = MMO->getSize(); 2789 const LLT S32 = LLT::scalar(32); 2790 2791 Register Dst = MI.getOperand(0).getReg(); 2792 Register RSrc = MI.getOperand(2).getReg(); 2793 2794 // The typed intrinsics add an immediate after the registers. 2795 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2796 2797 // The struct intrinsic variants add one additional operand over raw. 2798 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2799 Register VIndex; 2800 int OpOffset = 0; 2801 if (HasVIndex) { 2802 VIndex = MI.getOperand(3).getReg(); 2803 OpOffset = 1; 2804 } 2805 2806 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2807 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2808 2809 unsigned Format = 0; 2810 if (IsTyped) { 2811 Format = MI.getOperand(5 + OpOffset).getImm(); 2812 ++OpOffset; 2813 } 2814 2815 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2816 unsigned ImmOffset; 2817 unsigned TotalOffset; 2818 2819 LLT Ty = MRI.getType(Dst); 2820 LLT EltTy = Ty.getScalarType(); 2821 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2822 const bool Unpacked = ST.hasUnpackedD16VMem(); 2823 2824 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2825 if (TotalOffset != 0) 2826 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2827 2828 unsigned Opc; 2829 2830 if (IsTyped) { 2831 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 2832 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 2833 } else if (IsFormat) { 2834 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 2835 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 2836 } else { 2837 switch (MemSize) { 2838 case 1: 2839 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 2840 break; 2841 case 2: 2842 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 2843 break; 2844 default: 2845 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 2846 break; 2847 } 2848 } 2849 2850 Register LoadDstReg; 2851 2852 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 2853 LLT UnpackedTy = Ty.changeElementSize(32); 2854 2855 if (IsExtLoad) 2856 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 2857 else if (Unpacked && IsD16 && Ty.isVector()) 2858 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 2859 else 2860 LoadDstReg = Dst; 2861 2862 if (!VIndex) 2863 VIndex = B.buildConstant(S32, 0).getReg(0); 2864 2865 auto MIB = B.buildInstr(Opc) 2866 .addDef(LoadDstReg) // vdata 2867 .addUse(RSrc) // rsrc 2868 .addUse(VIndex) // vindex 2869 .addUse(VOffset) // voffset 2870 .addUse(SOffset) // soffset 2871 .addImm(ImmOffset); // offset(imm) 2872 2873 if (IsTyped) 2874 MIB.addImm(Format); 2875 2876 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2877 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2878 .addMemOperand(MMO); 2879 2880 if (LoadDstReg != Dst) { 2881 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2882 2883 // Widen result for extending loads was widened. 
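// The load was emitted with a widened 32-bit (or unpacked d16) result type; truncate or repack it back to the type the intrinsic expects.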
2884 if (IsExtLoad) 2885 B.buildTrunc(Dst, LoadDstReg); 2886 else { 2887 // Repack to original 16-bit vector result 2888 // FIXME: G_TRUNC should work, but legalization currently fails 2889 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 2890 SmallVector<Register, 4> Repack; 2891 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 2892 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 2893 B.buildMerge(Dst, Repack); 2894 } 2895 } 2896 2897 MI.eraseFromParent(); 2898 return true; 2899 } 2900 2901 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 2902 MachineIRBuilder &B, 2903 bool IsInc) const { 2904 B.setInstr(MI); 2905 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 2906 AMDGPU::G_AMDGPU_ATOMIC_DEC; 2907 B.buildInstr(Opc) 2908 .addDef(MI.getOperand(0).getReg()) 2909 .addUse(MI.getOperand(2).getReg()) 2910 .addUse(MI.getOperand(3).getReg()) 2911 .cloneMemRefs(MI); 2912 MI.eraseFromParent(); 2913 return true; 2914 } 2915 2916 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 2917 switch (IntrID) { 2918 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 2919 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 2920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 2921 case Intrinsic::amdgcn_raw_buffer_atomic_add: 2922 case Intrinsic::amdgcn_struct_buffer_atomic_add: 2923 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 2924 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 2925 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 2926 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 2927 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 2928 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 2929 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 2930 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 2931 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 2932 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 2933 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 2934 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 2935 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 2936 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 2937 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 2938 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 2939 case Intrinsic::amdgcn_raw_buffer_atomic_and: 2940 case Intrinsic::amdgcn_struct_buffer_atomic_and: 2941 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 2942 case Intrinsic::amdgcn_raw_buffer_atomic_or: 2943 case Intrinsic::amdgcn_struct_buffer_atomic_or: 2944 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 2945 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 2946 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 2947 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 2948 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 2949 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 2950 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 2951 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 2952 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 2953 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 2954 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 2955 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 2956 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 2957 default: 2958 llvm_unreachable("unhandled atomic opcode"); 2959 } 2960 } 2961 2962 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 2963 MachineIRBuilder &B, 2964 Intrinsic::ID IID) const { 2965 B.setInstr(MI); 2966 2967 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 2968 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 2969 2970 Register Dst = MI.getOperand(0).getReg(); 2971 Register VData = 
MI.getOperand(2).getReg(); 2972 2973 Register CmpVal; 2974 int OpOffset = 0; 2975 2976 if (IsCmpSwap) { 2977 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 2978 ++OpOffset; 2979 } 2980 2981 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 2982 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 2983 2984 // The struct intrinsic variants add one additional operand over raw. 2985 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2986 Register VIndex; 2987 if (HasVIndex) { 2988 VIndex = MI.getOperand(4 + OpOffset).getReg(); 2989 ++OpOffset; 2990 } 2991 2992 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 2993 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 2994 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 2995 2996 MachineMemOperand *MMO = *MI.memoperands_begin(); 2997 2998 unsigned ImmOffset; 2999 unsigned TotalOffset; 3000 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3001 if (TotalOffset != 0) 3002 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3003 3004 if (!VIndex) 3005 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3006 3007 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3008 .addDef(Dst) 3009 .addUse(VData); // vdata 3010 3011 if (IsCmpSwap) 3012 MIB.addReg(CmpVal); 3013 3014 MIB.addUse(RSrc) // rsrc 3015 .addUse(VIndex) // vindex 3016 .addUse(VOffset) // voffset 3017 .addUse(SOffset) // soffset 3018 .addImm(ImmOffset) // offset(imm) 3019 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3020 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3021 .addMemOperand(MMO); 3022 3023 MI.eraseFromParent(); 3024 return true; 3025 } 3026 3027 // Produce a vector of s16 elements from s32 pieces. 3028 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg, 3029 ArrayRef<Register> UnmergeParts) { 3030 const LLT S16 = LLT::scalar(16); 3031 3032 SmallVector<Register, 4> RemergeParts(UnmergeParts.size()); 3033 for (int I = 0, E = UnmergeParts.size(); I != E; ++I) 3034 RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0); 3035 3036 B.buildBuildVector(DstReg, RemergeParts); 3037 } 3038 3039 /// Convert a set of s32 registers to a result vector with s16 elements. 3040 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg, 3041 ArrayRef<Register> UnmergeParts) { 3042 MachineRegisterInfo &MRI = *B.getMRI(); 3043 const LLT V2S16 = LLT::vector(2, 16); 3044 LLT TargetTy = MRI.getType(DstReg); 3045 int NumElts = UnmergeParts.size(); 3046 3047 if (NumElts == 1) { 3048 assert(TargetTy == V2S16); 3049 B.buildBitcast(DstReg, UnmergeParts[0]); 3050 return; 3051 } 3052 3053 SmallVector<Register, 4> RemergeParts(NumElts); 3054 for (int I = 0; I != NumElts; ++I) 3055 RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0); 3056 3057 if (TargetTy.getSizeInBits() == 32u * NumElts) { 3058 B.buildConcatVectors(DstReg, RemergeParts); 3059 return; 3060 } 3061 3062 const LLT V3S16 = LLT::vector(3, 16); 3063 const LLT V6S16 = LLT::vector(6, 16); 3064 3065 // Widen to v6s16 and unpack v3 parts. 3066 assert(TargetTy == V3S16); 3067 3068 RemergeParts.push_back(B.buildUndef(V2S16).getReg(0)); 3069 auto Concat = B.buildConcatVectors(V6S16, RemergeParts); 3070 B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat); 3071 } 3072 3073 // FIXME: Just vector trunc should be sufficient, but legalization currently 3074 // broken.
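// For example, a <4 x s16> result loaded as <4 x s32> is unmerged into s32 pieces, each piece truncated to s16, and rebuilt with G_BUILD_VECTOR.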
3075 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg, 3076 Register WideDstReg) { 3077 const LLT S32 = LLT::scalar(32); 3078 const LLT S16 = LLT::scalar(16); 3079 3080 auto Unmerge = B.buildUnmerge(S32, WideDstReg); 3081 3082 int NumOps = Unmerge->getNumOperands() - 1; 3083 SmallVector<Register, 4> RemergeParts(NumOps); 3084 for (int I = 0; I != NumOps; ++I) 3085 RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0); 3086 3087 B.buildBuildVector(DstReg, RemergeParts); 3088 } 3089 3090 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3091 MachineInstr &MI, MachineIRBuilder &B, 3092 GISelChangeObserver &Observer, 3093 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3094 bool IsTFE = MI.getNumExplicitDefs() == 2; 3095 3096 // We are only processing the operands of d16 image operations on subtargets 3097 // that use the unpacked register layout, or need to repack the TFE result. 3098 3099 // TODO: Need to handle a16 images too 3100 // TODO: Do we need to guard against already legalized intrinsics? 3101 if (!IsTFE && !ST.hasUnpackedD16VMem()) 3102 return true; 3103 3104 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3105 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3106 3107 if (BaseOpcode->Atomic) // No d16 atomics, or TFE. 3108 return true; 3109 3110 B.setInstr(MI); 3111 3112 MachineRegisterInfo *MRI = B.getMRI(); 3113 const LLT S32 = LLT::scalar(32); 3114 const LLT S16 = LLT::scalar(16); 3115 3116 if (BaseOpcode->Store) { // No TFE for stores? 3117 Register VData = MI.getOperand(1).getReg(); 3118 LLT Ty = MRI->getType(VData); 3119 if (!Ty.isVector() || Ty.getElementType() != S16) 3120 return true; 3121 3122 B.setInstr(MI); 3123 3124 Observer.changingInstr(MI); 3125 MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData)); 3126 Observer.changedInstr(MI); 3127 return true; 3128 } 3129 3130 Register DstReg = MI.getOperand(0).getReg(); 3131 LLT Ty = MRI->getType(DstReg); 3132 const LLT EltTy = Ty.getScalarType(); 3133 const bool IsD16 = Ty.getScalarType() == S16; 3134 const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3135 3136 if (IsTFE) { 3137 // In the IR, TFE is supposed to be used with a 2 element struct return 3138 // type. The instruction really returns these two values in one contiguous 3139 // register, with one additional dword beyond the loaded data. Rewrite the 3140 // return type to use a single register result. 3141 Register Dst1Reg = MI.getOperand(1).getReg(); 3142 if (MRI->getType(Dst1Reg) != S32) 3143 return false; 3144 3145 // TODO: Make sure the TFE operand bit is set. 3146 3147 // The raw dword aligned data component of the load. The only legal cases 3148 // where this matters should be when using the packed D16 format, for 3149 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. 3150 LLT RoundedTy; 3151 LLT TFETy; 3152 3153 if (IsD16 && ST.hasUnpackedD16VMem()) { 3154 RoundedTy = LLT::scalarOrVector(NumElts, 32); 3155 TFETy = LLT::vector(NumElts + 1, 32); 3156 } else { 3157 unsigned EltSize = Ty.getScalarSizeInBits(); 3158 unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32; 3159 unsigned RoundedSize = 32 * RoundedElts; 3160 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3161 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3162 } 3163 3164 Register TFEReg = MRI->createGenericVirtualRegister(TFETy); 3165 Observer.changingInstr(MI); 3166 3167 MI.getOperand(0).setReg(TFEReg); 3168 MI.RemoveOperand(1); 3169 3170 Observer.changedInstr(MI); 3171 3172 // Insert after the instruction.
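// The unmerge of the combined data+TFE result reads the rewritten image load, so it has to be emitted after it.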
3173 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3174 3175 // Now figure out how to copy the new result register back into the old 3176 // result. 3177 3178 SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg); 3179 int NumDataElts = TFETy.getNumElements() - 1; 3180 3181 if (!Ty.isVector()) { 3182 // Simplest case is a trivial unmerge (plus a truncate for d16). 3183 UnmergeResults[0] = Ty == S32 ? 3184 DstReg : MRI->createGenericVirtualRegister(S32); 3185 3186 B.buildUnmerge(UnmergeResults, TFEReg); 3187 if (Ty != S32) 3188 B.buildTrunc(DstReg, UnmergeResults[0]); 3189 return true; 3190 } 3191 3192 // We have to repack into a new vector of some kind. 3193 for (int I = 0; I != NumDataElts; ++I) 3194 UnmergeResults[I] = MRI->createGenericVirtualRegister(S32); 3195 B.buildUnmerge(UnmergeResults, TFEReg); 3196 3197 // Drop the final TFE element. 3198 ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts); 3199 3200 if (EltTy == S32) 3201 B.buildBuildVector(DstReg, DataPart); 3202 else if (ST.hasUnpackedD16VMem()) 3203 truncToS16Vector(B, DstReg, DataPart); 3204 else 3205 bitcastToS16Vector(B, DstReg, DataPart); 3206 3207 return true; 3208 } 3209 3210 // Must be an image load. 3211 if (!Ty.isVector() || Ty.getElementType() != S16) 3212 return true; 3213 3214 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3215 3216 LLT WidenedTy = Ty.changeElementType(S32); 3217 Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy); 3218 3219 Observer.changingInstr(MI); 3220 MI.getOperand(0).setReg(WideDstReg); 3221 Observer.changedInstr(MI); 3222 3223 repackUnpackedD16Load(B, DstReg, WideDstReg); 3224 return true; 3225 } 3226 3227 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 3228 MachineInstr &MI, MachineIRBuilder &B, 3229 GISelChangeObserver &Observer) const { 3230 Register Dst = MI.getOperand(0).getReg(); 3231 LLT Ty = B.getMRI()->getType(Dst); 3232 unsigned Size = Ty.getSizeInBits(); 3233 MachineFunction &MF = B.getMF(); 3234 3235 Observer.changingInstr(MI); 3236 3237 // FIXME: We don't really need this intermediate instruction. The intrinsic 3238 // should be fixed to have a memory operand. Since it's readnone, we're not 3239 // allowed to add one. 3240 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 3241 MI.RemoveOperand(1); // Remove intrinsic ID 3242 3243 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 3244 // TODO: Should this use datalayout alignment? 3245 const unsigned MemSize = (Size + 7) / 8; 3246 const unsigned MemAlign = 4; 3247 MachineMemOperand *MMO = MF.getMachineMemOperand( 3248 MachinePointerInfo(), 3249 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 3250 MachineMemOperand::MOInvariant, MemSize, MemAlign); 3251 MI.addMemOperand(MF, MMO); 3252 3253 // There are no 96-bit result scalar loads, but widening to 128-bit should 3254 // always be legal. We may need to restore this to a 96-bit result if it turns 3255 // out this needs to be converted to a vector load during RegBankSelect. 
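// e.g. an s96 result is widened to s128, and a <3 x s32> result to <4 x s32>.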
3256 if (!isPowerOf2_32(Size)) { 3257 LegalizerHelper Helper(MF, *this, Observer, B); 3258 B.setInstr(MI); 3259 3260 if (Ty.isVector()) 3261 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 3262 else 3263 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 3264 } 3265 3266 Observer.changedInstr(MI); 3267 return true; 3268 } 3269 3270 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 3271 MachineIRBuilder &B, 3272 GISelChangeObserver &Observer) const { 3273 MachineRegisterInfo &MRI = *B.getMRI(); 3274 3275 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 3276 auto IntrID = MI.getIntrinsicID(); 3277 switch (IntrID) { 3278 case Intrinsic::amdgcn_if: 3279 case Intrinsic::amdgcn_else: { 3280 MachineInstr *Br = nullptr; 3281 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3282 const SIRegisterInfo *TRI 3283 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3284 3285 B.setInstr(*BrCond); 3286 Register Def = MI.getOperand(1).getReg(); 3287 Register Use = MI.getOperand(3).getReg(); 3288 3289 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 3290 if (Br) 3291 BrTarget = Br->getOperand(0).getMBB(); 3292 3293 if (IntrID == Intrinsic::amdgcn_if) { 3294 B.buildInstr(AMDGPU::SI_IF) 3295 .addDef(Def) 3296 .addUse(Use) 3297 .addMBB(BrTarget); 3298 } else { 3299 B.buildInstr(AMDGPU::SI_ELSE) 3300 .addDef(Def) 3301 .addUse(Use) 3302 .addMBB(BrTarget) 3303 .addImm(0); 3304 } 3305 3306 if (Br) 3307 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 3308 3309 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 3310 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 3311 MI.eraseFromParent(); 3312 BrCond->eraseFromParent(); 3313 return true; 3314 } 3315 3316 return false; 3317 } 3318 case Intrinsic::amdgcn_loop: { 3319 MachineInstr *Br = nullptr; 3320 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3321 const SIRegisterInfo *TRI 3322 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3323 3324 B.setInstr(*BrCond); 3325 3326 // FIXME: Need to adjust branch targets based on unconditional branch. 
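// SI_LOOP consumes the loop mask and branches to the block targeted by the original G_BRCOND.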
3327 Register Reg = MI.getOperand(2).getReg(); 3328 B.buildInstr(AMDGPU::SI_LOOP) 3329 .addUse(Reg) 3330 .addMBB(BrCond->getOperand(1).getMBB()); 3331 MI.eraseFromParent(); 3332 BrCond->eraseFromParent(); 3333 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 3334 return true; 3335 } 3336 3337 return false; 3338 } 3339 case Intrinsic::amdgcn_kernarg_segment_ptr: 3340 return legalizePreloadedArgIntrin( 3341 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3342 case Intrinsic::amdgcn_implicitarg_ptr: 3343 return legalizeImplicitArgPtr(MI, MRI, B); 3344 case Intrinsic::amdgcn_workitem_id_x: 3345 return legalizePreloadedArgIntrin(MI, MRI, B, 3346 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 3347 case Intrinsic::amdgcn_workitem_id_y: 3348 return legalizePreloadedArgIntrin(MI, MRI, B, 3349 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 3350 case Intrinsic::amdgcn_workitem_id_z: 3351 return legalizePreloadedArgIntrin(MI, MRI, B, 3352 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 3353 case Intrinsic::amdgcn_workgroup_id_x: 3354 return legalizePreloadedArgIntrin(MI, MRI, B, 3355 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 3356 case Intrinsic::amdgcn_workgroup_id_y: 3357 return legalizePreloadedArgIntrin(MI, MRI, B, 3358 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 3359 case Intrinsic::amdgcn_workgroup_id_z: 3360 return legalizePreloadedArgIntrin(MI, MRI, B, 3361 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 3362 case Intrinsic::amdgcn_dispatch_ptr: 3363 return legalizePreloadedArgIntrin(MI, MRI, B, 3364 AMDGPUFunctionArgInfo::DISPATCH_PTR); 3365 case Intrinsic::amdgcn_queue_ptr: 3366 return legalizePreloadedArgIntrin(MI, MRI, B, 3367 AMDGPUFunctionArgInfo::QUEUE_PTR); 3368 case Intrinsic::amdgcn_implicit_buffer_ptr: 3369 return legalizePreloadedArgIntrin( 3370 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 3371 case Intrinsic::amdgcn_dispatch_id: 3372 return legalizePreloadedArgIntrin(MI, MRI, B, 3373 AMDGPUFunctionArgInfo::DISPATCH_ID); 3374 case Intrinsic::amdgcn_fdiv_fast: 3375 return legalizeFDIVFastIntrin(MI, MRI, B); 3376 case Intrinsic::amdgcn_is_shared: 3377 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 3378 case Intrinsic::amdgcn_is_private: 3379 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 3380 case Intrinsic::amdgcn_wavefrontsize: { 3381 B.setInstr(MI); 3382 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 3383 MI.eraseFromParent(); 3384 return true; 3385 } 3386 case Intrinsic::amdgcn_s_buffer_load: 3387 return legalizeSBufferLoad(MI, B, Observer); 3388 case Intrinsic::amdgcn_raw_buffer_store: 3389 case Intrinsic::amdgcn_struct_buffer_store: 3390 return legalizeBufferStore(MI, MRI, B, false, false); 3391 case Intrinsic::amdgcn_raw_buffer_store_format: 3392 case Intrinsic::amdgcn_struct_buffer_store_format: 3393 return legalizeBufferStore(MI, MRI, B, false, true); 3394 case Intrinsic::amdgcn_raw_tbuffer_store: 3395 case Intrinsic::amdgcn_struct_tbuffer_store: 3396 return legalizeBufferStore(MI, MRI, B, true, true); 3397 case Intrinsic::amdgcn_raw_buffer_load: 3398 case Intrinsic::amdgcn_struct_buffer_load: 3399 return legalizeBufferLoad(MI, MRI, B, false, false); 3400 case Intrinsic::amdgcn_raw_buffer_load_format: 3401 case Intrinsic::amdgcn_struct_buffer_load_format: 3402 return legalizeBufferLoad(MI, MRI, B, true, false); 3403 case Intrinsic::amdgcn_raw_tbuffer_load: 3404 case Intrinsic::amdgcn_struct_tbuffer_load: 3405 return legalizeBufferLoad(MI, MRI, B, true, true); 3406 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3407 case 
Intrinsic::amdgcn_struct_buffer_atomic_swap: 3408 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3409 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3410 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3411 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3412 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3413 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3414 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3415 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3416 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3417 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3418 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3419 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3420 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3421 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3422 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3423 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3424 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3425 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3426 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3427 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3428 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3429 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3430 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3431 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3432 return legalizeBufferAtomic(MI, B, IntrID); 3433 case Intrinsic::amdgcn_atomic_inc: 3434 return legalizeAtomicIncDec(MI, B, true); 3435 case Intrinsic::amdgcn_atomic_dec: 3436 return legalizeAtomicIncDec(MI, B, false); 3437 default: { 3438 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 3439 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 3440 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 3441 return true; 3442 } 3443 } 3444 3445 return true; 3446 } 3447