//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

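// Note on how the helpers in this file compose (an illustrative sketch only;
// the real rules are built in the AMDGPULegalizerInfo constructor below).
// Each LegalityPredicate receives a LegalityQuery describing the candidate
// type indices and memory operands, and each LegalizeMutation returns the
// (type index, new type) pair the LegalizerHelper should rewrite to. For
// example, the bitwise ops later combine them as:
//
//   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
//     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
//     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0));
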
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 101 return [=](const LegalityQuery &Query) { 102 const LLT Ty = Query.Types[TypeIdx]; 103 const LLT EltTy = Ty.getElementType(); 104 unsigned Size = Ty.getSizeInBits(); 105 unsigned Pieces = (Size + 63) / 64; 106 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 107 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 108 }; 109 } 110 111 // Increase the number of vector elements to reach the next multiple of 32-bit 112 // type. 113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 114 return [=](const LegalityQuery &Query) { 115 const LLT Ty = Query.Types[TypeIdx]; 116 117 const LLT EltTy = Ty.getElementType(); 118 const int Size = Ty.getSizeInBits(); 119 const int EltSize = EltTy.getSizeInBits(); 120 const int NextMul32 = (Size + 31) / 32; 121 122 assert(EltSize < 32); 123 124 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 125 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 126 }; 127 } 128 129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 130 return [=](const LegalityQuery &Query) { 131 const LLT QueryTy = Query.Types[TypeIdx]; 132 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 133 }; 134 } 135 136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 137 return [=](const LegalityQuery &Query) { 138 const LLT QueryTy = Query.Types[TypeIdx]; 139 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 140 }; 141 } 142 143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 144 return [=](const LegalityQuery &Query) { 145 const LLT QueryTy = Query.Types[TypeIdx]; 146 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 147 }; 148 } 149 150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 151 // v2s16. 
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 153 return [=](const LegalityQuery &Query) { 154 const LLT Ty = Query.Types[TypeIdx]; 155 if (Ty.isVector()) { 156 const int EltSize = Ty.getElementType().getSizeInBits(); 157 return EltSize == 32 || EltSize == 64 || 158 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 159 EltSize == 128 || EltSize == 256; 160 } 161 162 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 163 }; 164 } 165 166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 167 return [=](const LegalityQuery &Query) { 168 const LLT QueryTy = Query.Types[TypeIdx]; 169 return QueryTy.isVector() && QueryTy.getElementType() == Type; 170 }; 171 } 172 173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 174 return [=](const LegalityQuery &Query) { 175 const LLT Ty = Query.Types[TypeIdx]; 176 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 177 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 178 }; 179 } 180 181 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 182 const GCNTargetMachine &TM) 183 : ST(ST_) { 184 using namespace TargetOpcode; 185 186 auto GetAddrSpacePtr = [&TM](unsigned AS) { 187 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 188 }; 189 190 const LLT S1 = LLT::scalar(1); 191 const LLT S16 = LLT::scalar(16); 192 const LLT S32 = LLT::scalar(32); 193 const LLT S64 = LLT::scalar(64); 194 const LLT S96 = LLT::scalar(96); 195 const LLT S128 = LLT::scalar(128); 196 const LLT S256 = LLT::scalar(256); 197 const LLT S1024 = LLT::scalar(1024); 198 199 const LLT V2S16 = LLT::vector(2, 16); 200 const LLT V4S16 = LLT::vector(4, 16); 201 202 const LLT V2S32 = LLT::vector(2, 32); 203 const LLT V3S32 = LLT::vector(3, 32); 204 const LLT V4S32 = LLT::vector(4, 32); 205 const LLT V5S32 = LLT::vector(5, 32); 206 const LLT V6S32 = LLT::vector(6, 32); 207 const LLT V7S32 = LLT::vector(7, 32); 208 const LLT V8S32 = LLT::vector(8, 32); 209 const LLT V9S32 = LLT::vector(9, 32); 210 const LLT V10S32 = LLT::vector(10, 32); 211 const LLT V11S32 = LLT::vector(11, 32); 212 const LLT V12S32 = LLT::vector(12, 32); 213 const LLT V13S32 = LLT::vector(13, 32); 214 const LLT V14S32 = LLT::vector(14, 32); 215 const LLT V15S32 = LLT::vector(15, 32); 216 const LLT V16S32 = LLT::vector(16, 32); 217 const LLT V32S32 = LLT::vector(32, 32); 218 219 const LLT V2S64 = LLT::vector(2, 64); 220 const LLT V3S64 = LLT::vector(3, 64); 221 const LLT V4S64 = LLT::vector(4, 64); 222 const LLT V5S64 = LLT::vector(5, 64); 223 const LLT V6S64 = LLT::vector(6, 64); 224 const LLT V7S64 = LLT::vector(7, 64); 225 const LLT V8S64 = LLT::vector(8, 64); 226 const LLT V16S64 = LLT::vector(16, 64); 227 228 std::initializer_list<LLT> AllS32Vectors = 229 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 230 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 231 std::initializer_list<LLT> AllS64Vectors = 232 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 233 234 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 235 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 236 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 237 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 238 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 239 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 240 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 241 242 const LLT CodePtr = FlatPtr; 243 244 const 
std::initializer_list<LLT> AddrSpaces64 = { 245 GlobalPtr, ConstantPtr, FlatPtr 246 }; 247 248 const std::initializer_list<LLT> AddrSpaces32 = { 249 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 250 }; 251 252 const std::initializer_list<LLT> FPTypesBase = { 253 S32, S64 254 }; 255 256 const std::initializer_list<LLT> FPTypes16 = { 257 S32, S64, S16 258 }; 259 260 const std::initializer_list<LLT> FPTypesPK16 = { 261 S32, S64, S16, V2S16 262 }; 263 264 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 265 266 setAction({G_BRCOND, S1}, Legal); // VCC branches 267 setAction({G_BRCOND, S32}, Legal); // SCC branches 268 269 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 270 // elements for v3s16 271 getActionDefinitionsBuilder(G_PHI) 272 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 273 .legalFor(AllS32Vectors) 274 .legalFor(AllS64Vectors) 275 .legalFor(AddrSpaces64) 276 .legalFor(AddrSpaces32) 277 .clampScalar(0, S32, S256) 278 .widenScalarToNextPow2(0, 32) 279 .clampMaxNumElements(0, S32, 16) 280 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 281 .legalIf(isPointer(0)); 282 283 if (ST.has16BitInsts()) { 284 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 285 .legalFor({S32, S16}) 286 .clampScalar(0, S16, S32) 287 .scalarize(0) 288 .widenScalarToNextPow2(0, 32); 289 } else { 290 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 291 .legalFor({S32}) 292 .clampScalar(0, S32, S32) 293 .scalarize(0); 294 } 295 296 // FIXME: Not really legal. Placeholder for custom lowering. 297 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 298 .legalFor({S32, S64}) 299 .clampScalar(0, S32, S64) 300 .widenScalarToNextPow2(0, 32) 301 .scalarize(0); 302 303 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 304 .legalFor({S32}) 305 .clampScalar(0, S32, S32) 306 .scalarize(0); 307 308 // Report legal for any types we can handle anywhere. For the cases only legal 309 // on the SALU, RegBankSelect will be able to re-legalize. 310 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 311 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 312 .clampScalar(0, S32, S64) 313 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 314 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 315 .widenScalarToNextPow2(0) 316 .scalarize(0); 317 318 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 319 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 320 .legalFor({{S32, S1}, {S32, S32}}) 321 .clampScalar(0, S32, S32) 322 .scalarize(0); // TODO: Implement. 323 324 getActionDefinitionsBuilder(G_BITCAST) 325 // Don't worry about the size constraint. 
326 .legalIf(all(isRegisterType(0), isRegisterType(1))) 327 .lower(); 328 329 330 getActionDefinitionsBuilder(G_CONSTANT) 331 .legalFor({S1, S32, S64, S16, GlobalPtr, 332 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 333 .clampScalar(0, S32, S64) 334 .widenScalarToNextPow2(0) 335 .legalIf(isPointer(0)); 336 337 getActionDefinitionsBuilder(G_FCONSTANT) 338 .legalFor({S32, S64, S16}) 339 .clampScalar(0, S16, S64); 340 341 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 342 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 343 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 344 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 345 .clampScalarOrElt(0, S32, S1024) 346 .legalIf(isMultiple32(0)) 347 .widenScalarToNextPow2(0, 32) 348 .clampMaxNumElements(0, S32, 16); 349 350 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 351 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 352 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); 353 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 354 355 auto &FPOpActions = getActionDefinitionsBuilder( 356 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 357 .legalFor({S32, S64}); 358 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 359 .customFor({S32, S64}); 360 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 361 .customFor({S32, S64}); 362 363 if (ST.has16BitInsts()) { 364 if (ST.hasVOP3PInsts()) 365 FPOpActions.legalFor({S16, V2S16}); 366 else 367 FPOpActions.legalFor({S16}); 368 369 TrigActions.customFor({S16}); 370 FDIVActions.customFor({S16}); 371 } 372 373 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 374 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 375 376 if (ST.hasVOP3PInsts()) { 377 MinNumMaxNum.customFor(FPTypesPK16) 378 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 379 .clampMaxNumElements(0, S16, 2) 380 .clampScalar(0, S16, S64) 381 .scalarize(0); 382 } else if (ST.has16BitInsts()) { 383 MinNumMaxNum.customFor(FPTypes16) 384 .clampScalar(0, S16, S64) 385 .scalarize(0); 386 } else { 387 MinNumMaxNum.customFor(FPTypesBase) 388 .clampScalar(0, S32, S64) 389 .scalarize(0); 390 } 391 392 if (ST.hasVOP3PInsts()) 393 FPOpActions.clampMaxNumElements(0, S16, 2); 394 395 FPOpActions 396 .scalarize(0) 397 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 398 399 TrigActions 400 .scalarize(0) 401 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 402 403 FDIVActions 404 .scalarize(0) 405 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 406 407 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 408 .legalFor(FPTypesPK16) 409 .clampMaxNumElements(0, S16, 2) 410 .scalarize(0) 411 .clampScalar(0, S16, S64); 412 413 if (ST.has16BitInsts()) { 414 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 415 .legalFor({S32, S64, S16}) 416 .scalarize(0) 417 .clampScalar(0, S16, S64); 418 } else { 419 getActionDefinitionsBuilder(G_FSQRT) 420 .legalFor({S32, S64}) 421 .scalarize(0) 422 .clampScalar(0, S32, S64); 423 424 if (ST.hasFractBug()) { 425 getActionDefinitionsBuilder(G_FFLOOR) 426 .customFor({S64}) 427 .legalFor({S32, S64}) 428 .scalarize(0) 429 .clampScalar(0, S32, S64); 430 } else { 431 getActionDefinitionsBuilder(G_FFLOOR) 432 .legalFor({S32, S64}) 433 .scalarize(0) 434 .clampScalar(0, S32, S64); 435 } 436 } 437 438 getActionDefinitionsBuilder(G_FPTRUNC) 439 .legalFor({{S32, S64}, {S16, S32}}) 440 .scalarize(0); 441 442 getActionDefinitionsBuilder(G_FPEXT) 443 .legalFor({{S64, S32}, {S32, S16}}) 444 .lowerFor({{S64, S16}}) // FIXME: Implement 445 .scalarize(0); 446 447 getActionDefinitionsBuilder(G_FSUB) 448 // Use actual fsub instruction 449 .legalFor({S32}) 450 // Must use fadd + fneg 451 .lowerFor({S64, S16, V2S16}) 452 .scalarize(0) 453 .clampScalar(0, S32, S64); 454 455 // Whether this is legal depends on the floating point mode for the function. 456 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 457 if (ST.hasMadF16()) 458 FMad.customFor({S32, S16}); 459 else 460 FMad.customFor({S32}); 461 FMad.scalarize(0) 462 .lower(); 463 464 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 465 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 466 {S32, S1}, {S64, S1}, {S16, S1}}) 467 .scalarize(0) 468 .clampScalar(0, S32, S64) 469 .widenScalarToNextPow2(1, 32); 470 471 // TODO: Split s1->s64 during regbankselect for VALU. 472 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 473 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 474 .lowerFor({{S32, S64}}) 475 .lowerIf(typeIs(1, S1)) 476 .customFor({{S64, S64}}); 477 if (ST.has16BitInsts()) 478 IToFP.legalFor({{S16, S16}}); 479 IToFP.clampScalar(1, S32, S64) 480 .scalarize(0); 481 482 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 483 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 484 .customFor({{S64, S64}}); 485 if (ST.has16BitInsts()) 486 FPToI.legalFor({{S16, S16}}); 487 else 488 FPToI.minScalar(1, S32); 489 490 FPToI.minScalar(0, S32) 491 .scalarize(0) 492 .lower(); 493 494 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 495 .scalarize(0) 496 .lower(); 497 498 if (ST.has16BitInsts()) { 499 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 500 .legalFor({S16, S32, S64}) 501 .clampScalar(0, S16, S64) 502 .scalarize(0); 503 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 504 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 505 .legalFor({S32, S64}) 506 .clampScalar(0, S32, S64) 507 .scalarize(0); 508 } else { 509 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 510 .legalFor({S32}) 511 .customFor({S64}) 512 .clampScalar(0, S32, S64) 513 .scalarize(0); 514 } 515 516 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 517 .scalarize(0) 518 .alwaysLegal(); 519 520 auto &CmpBuilder = 521 getActionDefinitionsBuilder(G_ICMP) 522 // The compare output type differs based on the register bank of the output, 523 // so make both s1 and s32 legal. 
524 // 525 // Scalar compares producing output in scc will be promoted to s32, as that 526 // is the allocatable register type that will be needed for the copy from 527 // scc. This will be promoted during RegBankSelect, and we assume something 528 // before that won't try to use s32 result types. 529 // 530 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 531 // bank. 532 .legalForCartesianProduct( 533 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 534 .legalForCartesianProduct( 535 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 536 if (ST.has16BitInsts()) { 537 CmpBuilder.legalFor({{S1, S16}}); 538 } 539 540 CmpBuilder 541 .widenScalarToNextPow2(1) 542 .clampScalar(1, S32, S64) 543 .scalarize(0) 544 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 545 546 getActionDefinitionsBuilder(G_FCMP) 547 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 548 .widenScalarToNextPow2(1) 549 .clampScalar(1, S32, S64) 550 .scalarize(0); 551 552 // FIXME: fpow has a selection pattern that should move to custom lowering. 553 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW}); 554 if (ST.has16BitInsts()) 555 Exp2Ops.legalFor({S32, S16}); 556 else 557 Exp2Ops.legalFor({S32}); 558 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 559 Exp2Ops.scalarize(0); 560 561 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10}); 562 if (ST.has16BitInsts()) 563 ExpOps.customFor({{S32}, {S16}}); 564 else 565 ExpOps.customFor({S32}); 566 ExpOps.clampScalar(0, MinScalarFPTy, S32) 567 .scalarize(0); 568 569 // The 64-bit versions produce 32-bit results, but only on the SALU. 570 getActionDefinitionsBuilder(G_CTPOP) 571 .legalFor({{S32, S32}, {S32, S64}}) 572 .clampScalar(0, S32, S32) 573 .clampScalar(1, S32, S64) 574 .scalarize(0) 575 .widenScalarToNextPow2(0, 32) 576 .widenScalarToNextPow2(1, 32); 577 578 // The hardware instructions return a different result on 0 than the generic 579 // instructions expect. The hardware produces -1, but these produce the 580 // bitwidth. 581 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 582 .scalarize(0) 583 .clampScalar(0, S32, S32) 584 .clampScalar(1, S32, S64) 585 .widenScalarToNextPow2(0, 32) 586 .widenScalarToNextPow2(1, 32) 587 .lower(); 588 589 // The 64-bit versions produce 32-bit results, but only on the SALU. 
590 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 591 .legalFor({{S32, S32}, {S32, S64}}) 592 .clampScalar(0, S32, S32) 593 .clampScalar(1, S32, S64) 594 .scalarize(0) 595 .widenScalarToNextPow2(0, 32) 596 .widenScalarToNextPow2(1, 32); 597 598 // TODO: Expand for > s32 599 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) 600 .legalFor({S32}) 601 .clampScalar(0, S32, S32) 602 .scalarize(0); 603 604 if (ST.has16BitInsts()) { 605 if (ST.hasVOP3PInsts()) { 606 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 607 .legalFor({S32, S16, V2S16}) 608 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 609 .clampMaxNumElements(0, S16, 2) 610 .clampScalar(0, S16, S32) 611 .widenScalarToNextPow2(0) 612 .scalarize(0); 613 } else { 614 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 615 .legalFor({S32, S16}) 616 .widenScalarToNextPow2(0) 617 .clampScalar(0, S16, S32) 618 .scalarize(0); 619 } 620 } else { 621 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 622 .legalFor({S32}) 623 .clampScalar(0, S32, S32) 624 .widenScalarToNextPow2(0) 625 .scalarize(0); 626 } 627 628 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 629 return [=](const LegalityQuery &Query) { 630 return Query.Types[TypeIdx0].getSizeInBits() < 631 Query.Types[TypeIdx1].getSizeInBits(); 632 }; 633 }; 634 635 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 636 return [=](const LegalityQuery &Query) { 637 return Query.Types[TypeIdx0].getSizeInBits() > 638 Query.Types[TypeIdx1].getSizeInBits(); 639 }; 640 }; 641 642 getActionDefinitionsBuilder(G_INTTOPTR) 643 // List the common cases 644 .legalForCartesianProduct(AddrSpaces64, {S64}) 645 .legalForCartesianProduct(AddrSpaces32, {S32}) 646 .scalarize(0) 647 // Accept any address space as long as the size matches 648 .legalIf(sameSize(0, 1)) 649 .widenScalarIf(smallerThan(1, 0), 650 [](const LegalityQuery &Query) { 651 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 652 }) 653 .narrowScalarIf(greaterThan(1, 0), 654 [](const LegalityQuery &Query) { 655 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 656 }); 657 658 getActionDefinitionsBuilder(G_PTRTOINT) 659 // List the common cases 660 .legalForCartesianProduct(AddrSpaces64, {S64}) 661 .legalForCartesianProduct(AddrSpaces32, {S32}) 662 .scalarize(0) 663 // Accept any address space as long as the size matches 664 .legalIf(sameSize(0, 1)) 665 .widenScalarIf(smallerThan(0, 1), 666 [](const LegalityQuery &Query) { 667 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 668 }) 669 .narrowScalarIf( 670 greaterThan(0, 1), 671 [](const LegalityQuery &Query) { 672 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 673 }); 674 675 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 676 .scalarize(0) 677 .custom(); 678 679 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 680 // handle some operations by just promoting the register during 681 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 682 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 683 switch (AS) { 684 // FIXME: Private element size. 685 case AMDGPUAS::PRIVATE_ADDRESS: 686 return 32; 687 // FIXME: Check subtarget 688 case AMDGPUAS::LOCAL_ADDRESS: 689 return ST.useDS128() ? 128 : 64; 690 691 // Treat constant and global as identical. 
SMRD loads are sometimes usable 692 // for global loads (ideally constant address space should be eliminated) 693 // depending on the context. Legality cannot be context dependent, but 694 // RegBankSelect can split the load as necessary depending on the pointer 695 // register bank/uniformity and if the memory is invariant or not written in 696 // a kernel. 697 case AMDGPUAS::CONSTANT_ADDRESS: 698 case AMDGPUAS::GLOBAL_ADDRESS: 699 return IsLoad ? 512 : 128; 700 default: 701 return 128; 702 } 703 }; 704 705 const auto needToSplitMemOp = [=](const LegalityQuery &Query, bool IsLoad) -> bool { 706 const LLT DstTy = Query.Types[0]; 707 708 // Split vector extloads. 709 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 710 unsigned Align = Query.MMODescrs[0].AlignInBits; 711 712 if (MemSize < DstTy.getSizeInBits()) 713 MemSize = std::max(MemSize, Align); 714 715 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 716 return true; 717 718 const LLT PtrTy = Query.Types[1]; 719 unsigned AS = PtrTy.getAddressSpace(); 720 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 721 return true; 722 723 // Catch weird sized loads that don't evenly divide into the access sizes 724 // TODO: May be able to widen depending on alignment etc. 725 unsigned NumRegs = MemSize / 32; 726 if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) 727 return true; 728 729 if (Align < MemSize) { 730 const SITargetLowering *TLI = ST.getTargetLowering(); 731 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 732 } 733 734 return false; 735 }; 736 737 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 738 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 739 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 740 741 // TODO: Refine based on subtargets which support unaligned access or 128-bit 742 // LDS 743 // TODO: Unsupported flat for SI. 744 745 for (unsigned Op : {G_LOAD, G_STORE}) { 746 const bool IsStore = Op == G_STORE; 747 748 auto &Actions = getActionDefinitionsBuilder(Op); 749 // Whitelist the common cases. 
750 // TODO: Pointer loads 751 // TODO: Wide constant loads 752 // TODO: Only CI+ has 3x loads 753 // TODO: Loads to s16 on gfx9 754 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 755 {V2S32, GlobalPtr, 64, GlobalAlign32}, 756 {V3S32, GlobalPtr, 96, GlobalAlign32}, 757 {S96, GlobalPtr, 96, GlobalAlign32}, 758 {V4S32, GlobalPtr, 128, GlobalAlign32}, 759 {S128, GlobalPtr, 128, GlobalAlign32}, 760 {S64, GlobalPtr, 64, GlobalAlign32}, 761 {V2S64, GlobalPtr, 128, GlobalAlign32}, 762 {V2S16, GlobalPtr, 32, GlobalAlign32}, 763 {S32, GlobalPtr, 8, GlobalAlign8}, 764 {S32, GlobalPtr, 16, GlobalAlign16}, 765 766 {S32, LocalPtr, 32, 32}, 767 {S64, LocalPtr, 64, 32}, 768 {V2S32, LocalPtr, 64, 32}, 769 {S32, LocalPtr, 8, 8}, 770 {S32, LocalPtr, 16, 16}, 771 {V2S16, LocalPtr, 32, 32}, 772 773 {S32, PrivatePtr, 32, 32}, 774 {S32, PrivatePtr, 8, 8}, 775 {S32, PrivatePtr, 16, 16}, 776 {V2S16, PrivatePtr, 32, 32}, 777 778 {S32, FlatPtr, 32, GlobalAlign32}, 779 {S32, FlatPtr, 16, GlobalAlign16}, 780 {S32, FlatPtr, 8, GlobalAlign8}, 781 {V2S16, FlatPtr, 32, GlobalAlign32}, 782 783 {S32, ConstantPtr, 32, GlobalAlign32}, 784 {V2S32, ConstantPtr, 64, GlobalAlign32}, 785 {V3S32, ConstantPtr, 96, GlobalAlign32}, 786 {V4S32, ConstantPtr, 128, GlobalAlign32}, 787 {S64, ConstantPtr, 64, GlobalAlign32}, 788 {S128, ConstantPtr, 128, GlobalAlign32}, 789 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 790 Actions 791 .customIf(typeIs(1, Constant32Ptr)) 792 .narrowScalarIf( 793 [=](const LegalityQuery &Query) -> bool { 794 return !Query.Types[0].isVector() && 795 needToSplitMemOp(Query, Op == G_LOAD); 796 }, 797 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 798 const LLT DstTy = Query.Types[0]; 799 const LLT PtrTy = Query.Types[1]; 800 801 const unsigned DstSize = DstTy.getSizeInBits(); 802 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 803 804 // Split extloads. 805 if (DstSize > MemSize) 806 return std::make_pair(0, LLT::scalar(MemSize)); 807 808 if (DstSize > 32 && (DstSize % 32 != 0)) { 809 // FIXME: Need a way to specify non-extload of larger size if 810 // suitably aligned. 811 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 812 } 813 814 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 815 Op == G_LOAD); 816 if (MemSize > MaxSize) 817 return std::make_pair(0, LLT::scalar(MaxSize)); 818 819 unsigned Align = Query.MMODescrs[0].AlignInBits; 820 return std::make_pair(0, LLT::scalar(Align)); 821 }) 822 .fewerElementsIf( 823 [=](const LegalityQuery &Query) -> bool { 824 return Query.Types[0].isVector() && 825 needToSplitMemOp(Query, Op == G_LOAD); 826 }, 827 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 828 const LLT DstTy = Query.Types[0]; 829 const LLT PtrTy = Query.Types[1]; 830 831 LLT EltTy = DstTy.getElementType(); 832 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 833 Op == G_LOAD); 834 835 // Split if it's too large for the address space. 836 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 837 unsigned NumElts = DstTy.getNumElements(); 838 unsigned EltSize = EltTy.getSizeInBits(); 839 840 if (MaxSize % EltSize == 0) { 841 return std::make_pair( 842 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 843 } 844 845 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 846 847 // FIXME: Refine when odd breakdowns handled 848 // The scalars will need to be re-legalized. 
849 if (NumPieces == 1 || NumPieces >= NumElts || 850 NumElts % NumPieces != 0) 851 return std::make_pair(0, EltTy); 852 853 return std::make_pair(0, 854 LLT::vector(NumElts / NumPieces, EltTy)); 855 } 856 857 // Need to split because of alignment. 858 unsigned Align = Query.MMODescrs[0].AlignInBits; 859 unsigned EltSize = EltTy.getSizeInBits(); 860 if (EltSize > Align && 861 (EltSize / Align < DstTy.getNumElements())) { 862 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 863 } 864 865 // May need relegalization for the scalars. 866 return std::make_pair(0, EltTy); 867 }) 868 .minScalar(0, S32); 869 870 if (IsStore) 871 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 872 873 // TODO: Need a bitcast lower option? 874 Actions 875 .legalIf([=](const LegalityQuery &Query) { 876 const LLT Ty0 = Query.Types[0]; 877 unsigned Size = Ty0.getSizeInBits(); 878 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 879 unsigned Align = Query.MMODescrs[0].AlignInBits; 880 881 // FIXME: Widening store from alignment not valid. 882 if (MemSize < Size) 883 MemSize = std::max(MemSize, Align); 884 885 // No extending vector loads. 886 if (Size > MemSize && Ty0.isVector()) 887 return false; 888 889 switch (MemSize) { 890 case 8: 891 case 16: 892 return Size == 32; 893 case 32: 894 case 64: 895 case 128: 896 return true; 897 case 96: 898 return ST.hasDwordx3LoadStores(); 899 case 256: 900 case 512: 901 return true; 902 default: 903 return false; 904 } 905 }) 906 .widenScalarToNextPow2(0) 907 // TODO: v3s32->v4s32 with alignment 908 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 909 } 910 911 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 912 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 913 {S32, GlobalPtr, 16, 2 * 8}, 914 {S32, LocalPtr, 8, 8}, 915 {S32, LocalPtr, 16, 16}, 916 {S32, PrivatePtr, 8, 8}, 917 {S32, PrivatePtr, 16, 16}, 918 {S32, ConstantPtr, 8, 8}, 919 {S32, ConstantPtr, 16, 2 * 8}}); 920 if (ST.hasFlatAddressSpace()) { 921 ExtLoads.legalForTypesWithMemDesc( 922 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 923 } 924 925 ExtLoads.clampScalar(0, S32, S32) 926 .widenScalarToNextPow2(0) 927 .unsupportedIfMemSizeNotPow2() 928 .lower(); 929 930 auto &Atomics = getActionDefinitionsBuilder( 931 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 932 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 933 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 934 G_ATOMICRMW_UMIN}) 935 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 936 {S64, GlobalPtr}, {S64, LocalPtr}}); 937 if (ST.hasFlatAddressSpace()) { 938 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 939 } 940 941 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 942 .legalFor({{S32, LocalPtr}}); 943 944 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 945 // demarshalling 946 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 947 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 948 {S32, FlatPtr}, {S64, FlatPtr}}) 949 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 950 {S32, RegionPtr}, {S64, RegionPtr}}); 951 // TODO: Pointer types, any 32-bit or 64-bit vector 952 953 // Condition should be s32 for scalar, s1 for vector. 
954 getActionDefinitionsBuilder(G_SELECT) 955 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 956 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 957 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 958 .clampScalar(0, S16, S64) 959 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 960 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 961 .scalarize(1) 962 .clampMaxNumElements(0, S32, 2) 963 .clampMaxNumElements(0, LocalPtr, 2) 964 .clampMaxNumElements(0, PrivatePtr, 2) 965 .scalarize(0) 966 .widenScalarToNextPow2(0) 967 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 968 969 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 970 // be more flexible with the shift amount type. 971 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 972 .legalFor({{S32, S32}, {S64, S32}}); 973 if (ST.has16BitInsts()) { 974 if (ST.hasVOP3PInsts()) { 975 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 976 .clampMaxNumElements(0, S16, 2); 977 } else 978 Shifts.legalFor({{S16, S32}, {S16, S16}}); 979 980 // TODO: Support 16-bit shift amounts 981 Shifts.clampScalar(1, S32, S32); 982 Shifts.clampScalar(0, S16, S64); 983 Shifts.widenScalarToNextPow2(0, 16); 984 } else { 985 // Make sure we legalize the shift amount type first, as the general 986 // expansion for the shifted type will produce much worse code if it hasn't 987 // been truncated already. 988 Shifts.clampScalar(1, S32, S32); 989 Shifts.clampScalar(0, S32, S64); 990 Shifts.widenScalarToNextPow2(0, 32); 991 } 992 Shifts.scalarize(0); 993 994 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 995 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 996 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 997 unsigned IdxTypeIdx = 2; 998 999 getActionDefinitionsBuilder(Op) 1000 .customIf([=](const LegalityQuery &Query) { 1001 const LLT EltTy = Query.Types[EltTypeIdx]; 1002 const LLT VecTy = Query.Types[VecTypeIdx]; 1003 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1004 return (EltTy.getSizeInBits() == 16 || 1005 EltTy.getSizeInBits() % 32 == 0) && 1006 VecTy.getSizeInBits() % 32 == 0 && 1007 VecTy.getSizeInBits() <= 1024 && 1008 IdxTy.getSizeInBits() == 32; 1009 }) 1010 .clampScalar(EltTypeIdx, S32, S64) 1011 .clampScalar(VecTypeIdx, S32, S64) 1012 .clampScalar(IdxTypeIdx, S32, S32); 1013 } 1014 1015 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1016 .unsupportedIf([=](const LegalityQuery &Query) { 1017 const LLT &EltTy = Query.Types[1].getElementType(); 1018 return Query.Types[0] != EltTy; 1019 }); 1020 1021 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1022 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1023 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1024 1025 // FIXME: Doesn't handle extract of illegal sizes. 1026 getActionDefinitionsBuilder(Op) 1027 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1028 // FIXME: Multiples of 16 should not be legal. 
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)

      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .lowerFor({{S16, V2S16}})
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
          Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1198 SextInReg.lowerFor({{S32}, {S64}}); 1199 } 1200 1201 SextInReg 1202 .scalarize(0) 1203 .clampScalar(0, S32, S64) 1204 .lower(); 1205 1206 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1207 .legalFor({S64}); 1208 1209 getActionDefinitionsBuilder({ 1210 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1211 G_FCOPYSIGN, 1212 1213 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1214 G_READ_REGISTER, 1215 G_WRITE_REGISTER, 1216 1217 G_SADDO, G_SSUBO, 1218 1219 // TODO: Implement 1220 G_FMINIMUM, G_FMAXIMUM 1221 }).lower(); 1222 1223 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1224 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1225 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1226 .unsupported(); 1227 1228 computeTables(); 1229 verify(*ST.getInstrInfo()); 1230 } 1231 1232 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1233 MachineRegisterInfo &MRI, 1234 MachineIRBuilder &B, 1235 GISelChangeObserver &Observer) const { 1236 switch (MI.getOpcode()) { 1237 case TargetOpcode::G_ADDRSPACE_CAST: 1238 return legalizeAddrSpaceCast(MI, MRI, B); 1239 case TargetOpcode::G_FRINT: 1240 return legalizeFrint(MI, MRI, B); 1241 case TargetOpcode::G_FCEIL: 1242 return legalizeFceil(MI, MRI, B); 1243 case TargetOpcode::G_INTRINSIC_TRUNC: 1244 return legalizeIntrinsicTrunc(MI, MRI, B); 1245 case TargetOpcode::G_SITOFP: 1246 return legalizeITOFP(MI, MRI, B, true); 1247 case TargetOpcode::G_UITOFP: 1248 return legalizeITOFP(MI, MRI, B, false); 1249 case TargetOpcode::G_FPTOSI: 1250 return legalizeFPTOI(MI, MRI, B, true); 1251 case TargetOpcode::G_FPTOUI: 1252 return legalizeFPTOI(MI, MRI, B, false); 1253 case TargetOpcode::G_FMINNUM: 1254 case TargetOpcode::G_FMAXNUM: 1255 case TargetOpcode::G_FMINNUM_IEEE: 1256 case TargetOpcode::G_FMAXNUM_IEEE: 1257 return legalizeMinNumMaxNum(MI, MRI, B); 1258 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1259 return legalizeExtractVectorElt(MI, MRI, B); 1260 case TargetOpcode::G_INSERT_VECTOR_ELT: 1261 return legalizeInsertVectorElt(MI, MRI, B); 1262 case TargetOpcode::G_SHUFFLE_VECTOR: 1263 return legalizeShuffleVector(MI, MRI, B); 1264 case TargetOpcode::G_FSIN: 1265 case TargetOpcode::G_FCOS: 1266 return legalizeSinCos(MI, MRI, B); 1267 case TargetOpcode::G_GLOBAL_VALUE: 1268 return legalizeGlobalValue(MI, MRI, B); 1269 case TargetOpcode::G_LOAD: 1270 return legalizeLoad(MI, MRI, B, Observer); 1271 case TargetOpcode::G_FMAD: 1272 return legalizeFMad(MI, MRI, B); 1273 case TargetOpcode::G_FDIV: 1274 return legalizeFDIV(MI, MRI, B); 1275 case TargetOpcode::G_ATOMIC_CMPXCHG: 1276 return legalizeAtomicCmpXChg(MI, MRI, B); 1277 case TargetOpcode::G_FLOG: 1278 return legalizeFlog(MI, B, 1.0f / numbers::log2ef); 1279 case TargetOpcode::G_FLOG10: 1280 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1281 case TargetOpcode::G_FEXP: 1282 return legalizeFExp(MI, B); 1283 case TargetOpcode::G_FFLOOR: 1284 return legalizeFFloor(MI, MRI, B); 1285 case TargetOpcode::G_BUILD_VECTOR: 1286 return legalizeBuildVector(MI, MRI, B); 1287 default: 1288 return false; 1289 } 1290 1291 llvm_unreachable("expected switch to return"); 1292 } 1293 1294 Register AMDGPULegalizerInfo::getSegmentAperture( 1295 unsigned AS, 1296 MachineRegisterInfo &MRI, 1297 MachineIRBuilder &B) const { 1298 MachineFunction &MF = B.getMF(); 1299 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1300 const LLT S32 = LLT::scalar(32); 1301 1302 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1303 1304 if (ST.hasApertureRegs()) { 1305 // FIXME: Use 
inline constants (src_{shared, private}_base) instead of 1306 // getreg. 1307 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1308 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1309 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1310 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1311 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1312 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1313 unsigned Encoding = 1314 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1315 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1316 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1317 1318 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1319 1320 B.buildInstr(AMDGPU::S_GETREG_B32) 1321 .addDef(GetReg) 1322 .addImm(Encoding); 1323 MRI.setType(GetReg, S32); 1324 1325 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1326 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1327 } 1328 1329 Register QueuePtr = MRI.createGenericVirtualRegister( 1330 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1331 1332 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1333 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1334 return Register(); 1335 1336 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1337 // private_segment_aperture_base_hi. 1338 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1339 1340 // TODO: can we be smarter about machine pointer info? 1341 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1342 MachineMemOperand *MMO = MF.getMachineMemOperand( 1343 PtrInfo, 1344 MachineMemOperand::MOLoad | 1345 MachineMemOperand::MODereferenceable | 1346 MachineMemOperand::MOInvariant, 1347 4, 1348 MinAlign(64, StructOffset)); 1349 1350 Register LoadAddr; 1351 1352 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1353 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1354 } 1355 1356 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1357 MachineInstr &MI, MachineRegisterInfo &MRI, 1358 MachineIRBuilder &B) const { 1359 MachineFunction &MF = B.getMF(); 1360 1361 B.setInstr(MI); 1362 1363 const LLT S32 = LLT::scalar(32); 1364 Register Dst = MI.getOperand(0).getReg(); 1365 Register Src = MI.getOperand(1).getReg(); 1366 1367 LLT DstTy = MRI.getType(Dst); 1368 LLT SrcTy = MRI.getType(Src); 1369 unsigned DestAS = DstTy.getAddressSpace(); 1370 unsigned SrcAS = SrcTy.getAddressSpace(); 1371 1372 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1373 // vector element. 1374 assert(!DstTy.isVector()); 1375 1376 const AMDGPUTargetMachine &TM 1377 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1378 1379 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1380 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1381 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1382 return true; 1383 } 1384 1385 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1386 // Truncate. 1387 B.buildExtract(Dst, Src, 0); 1388 MI.eraseFromParent(); 1389 return true; 1390 } 1391 1392 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1393 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1394 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1395 1396 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1397 // another. Merge operands are required to be the same type, but creating an 1398 // extra ptrtoint would be kind of pointless. 
1399 auto HighAddr = B.buildConstant( 1400 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1401 B.buildMerge(Dst, {Src, HighAddr}); 1402 MI.eraseFromParent(); 1403 return true; 1404 } 1405 1406 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1407 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1408 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1409 unsigned NullVal = TM.getNullPointerValue(DestAS); 1410 1411 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1412 auto FlatNull = B.buildConstant(SrcTy, 0); 1413 1414 // Extract low 32-bits of the pointer. 1415 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1416 1417 auto CmpRes = 1418 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1419 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1420 1421 MI.eraseFromParent(); 1422 return true; 1423 } 1424 1425 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1426 return false; 1427 1428 if (!ST.hasFlatAddressSpace()) 1429 return false; 1430 1431 auto SegmentNull = 1432 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1433 auto FlatNull = 1434 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1435 1436 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1437 if (!ApertureReg.isValid()) 1438 return false; 1439 1440 auto CmpRes = 1441 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1442 1443 // Coerce the type of the low half of the result so we can use merge_values. 1444 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1445 1446 // TODO: Should we allow mismatched types but matching sizes in merges to 1447 // avoid the ptrtoint? 1448 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1449 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1450 1451 MI.eraseFromParent(); 1452 return true; 1453 } 1454 1455 bool AMDGPULegalizerInfo::legalizeFrint( 1456 MachineInstr &MI, MachineRegisterInfo &MRI, 1457 MachineIRBuilder &B) const { 1458 B.setInstr(MI); 1459 1460 Register Src = MI.getOperand(1).getReg(); 1461 LLT Ty = MRI.getType(Src); 1462 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1463 1464 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1465 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1466 1467 auto C1 = B.buildFConstant(Ty, C1Val); 1468 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1469 1470 // TODO: Should this propagate fast-math-flags? 
1471 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1472 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1473 1474 auto C2 = B.buildFConstant(Ty, C2Val); 1475 auto Fabs = B.buildFAbs(Ty, Src); 1476 1477 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1478 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1479 return true; 1480 } 1481 1482 bool AMDGPULegalizerInfo::legalizeFceil( 1483 MachineInstr &MI, MachineRegisterInfo &MRI, 1484 MachineIRBuilder &B) const { 1485 B.setInstr(MI); 1486 1487 const LLT S1 = LLT::scalar(1); 1488 const LLT S64 = LLT::scalar(64); 1489 1490 Register Src = MI.getOperand(1).getReg(); 1491 assert(MRI.getType(Src) == S64); 1492 1493 // result = trunc(src) 1494 // if (src > 0.0 && src != result) 1495 // result += 1.0 1496 1497 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1498 1499 const auto Zero = B.buildFConstant(S64, 0.0); 1500 const auto One = B.buildFConstant(S64, 1.0); 1501 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1502 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1503 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1504 auto Add = B.buildSelect(S64, And, One, Zero); 1505 1506 // TODO: Should this propagate fast-math-flags? 1507 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1508 return true; 1509 } 1510 1511 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1512 MachineIRBuilder &B) { 1513 const unsigned FractBits = 52; 1514 const unsigned ExpBits = 11; 1515 LLT S32 = LLT::scalar(32); 1516 1517 auto Const0 = B.buildConstant(S32, FractBits - 32); 1518 auto Const1 = B.buildConstant(S32, ExpBits); 1519 1520 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1521 .addUse(Const0.getReg(0)) 1522 .addUse(Const1.getReg(0)); 1523 1524 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1525 } 1526 1527 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1528 MachineInstr &MI, MachineRegisterInfo &MRI, 1529 MachineIRBuilder &B) const { 1530 B.setInstr(MI); 1531 1532 const LLT S1 = LLT::scalar(1); 1533 const LLT S32 = LLT::scalar(32); 1534 const LLT S64 = LLT::scalar(64); 1535 1536 Register Src = MI.getOperand(1).getReg(); 1537 assert(MRI.getType(Src) == S64); 1538 1539 // TODO: Should this use extract since the low half is unused? 1540 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1541 Register Hi = Unmerge.getReg(1); 1542 1543 // Extract the upper half, since this is where we will find the sign and 1544 // exponent. 1545 auto Exp = extractF64Exponent(Hi, B); 1546 1547 const unsigned FractBits = 52; 1548 1549 // Extract the sign bit. 1550 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1551 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1552 1553 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1554 1555 const auto Zero32 = B.buildConstant(S32, 0); 1556 1557 // Extend back to 64-bits. 
1558 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1559 1560 auto Shr = B.buildAShr(S64, FractMask, Exp); 1561 auto Not = B.buildNot(S64, Shr); 1562 auto Tmp0 = B.buildAnd(S64, Src, Not); 1563 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1564 1565 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1566 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1567 1568 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1569 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1570 return true; 1571 } 1572 1573 bool AMDGPULegalizerInfo::legalizeITOFP( 1574 MachineInstr &MI, MachineRegisterInfo &MRI, 1575 MachineIRBuilder &B, bool Signed) const { 1576 B.setInstr(MI); 1577 1578 Register Dst = MI.getOperand(0).getReg(); 1579 Register Src = MI.getOperand(1).getReg(); 1580 1581 const LLT S64 = LLT::scalar(64); 1582 const LLT S32 = LLT::scalar(32); 1583 1584 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1585 1586 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1587 1588 auto CvtHi = Signed ? 1589 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1590 B.buildUITOFP(S64, Unmerge.getReg(1)); 1591 1592 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1593 1594 auto ThirtyTwo = B.buildConstant(S32, 32); 1595 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1596 .addUse(CvtHi.getReg(0)) 1597 .addUse(ThirtyTwo.getReg(0)); 1598 1599 // TODO: Should this propagate fast-math-flags? 1600 B.buildFAdd(Dst, LdExp, CvtLo); 1601 MI.eraseFromParent(); 1602 return true; 1603 } 1604 1605 // TODO: Copied from DAG implementation. Verify logic and document how this 1606 // actually works. 1607 bool AMDGPULegalizerInfo::legalizeFPTOI( 1608 MachineInstr &MI, MachineRegisterInfo &MRI, 1609 MachineIRBuilder &B, bool Signed) const { 1610 B.setInstr(MI); 1611 1612 Register Dst = MI.getOperand(0).getReg(); 1613 Register Src = MI.getOperand(1).getReg(); 1614 1615 const LLT S64 = LLT::scalar(64); 1616 const LLT S32 = LLT::scalar(32); 1617 1618 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1619 1620 unsigned Flags = MI.getFlags(); 1621 1622 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1623 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1624 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1625 1626 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1627 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1628 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1629 1630 auto Hi = Signed ? 
1631 B.buildFPTOSI(S32, FloorMul) : 1632 B.buildFPTOUI(S32, FloorMul); 1633 auto Lo = B.buildFPTOUI(S32, Fma); 1634 1635 B.buildMerge(Dst, { Lo, Hi }); 1636 MI.eraseFromParent(); 1637 1638 return true; 1639 } 1640 1641 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1642 MachineInstr &MI, MachineRegisterInfo &MRI, 1643 MachineIRBuilder &B) const { 1644 MachineFunction &MF = B.getMF(); 1645 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1646 1647 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1648 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1649 1650 // With ieee_mode disabled, the instructions have the correct behavior 1651 // already for G_FMINNUM/G_FMAXNUM 1652 if (!MFI->getMode().IEEE) 1653 return !IsIEEEOp; 1654 1655 if (IsIEEEOp) 1656 return true; 1657 1658 MachineIRBuilder HelperBuilder(MI); 1659 GISelObserverWrapper DummyObserver; 1660 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1661 HelperBuilder.setInstr(MI); 1662 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1663 } 1664 1665 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1666 MachineInstr &MI, MachineRegisterInfo &MRI, 1667 MachineIRBuilder &B) const { 1668 // TODO: Should move some of this into LegalizerHelper. 1669 1670 // TODO: Promote dynamic indexing of s16 to s32 1671 // TODO: Dynamic s64 indexing is only legal for SGPR. 1672 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); 1673 if (!IdxVal) // Dynamic case will be selected to register indexing. 1674 return true; 1675 1676 Register Dst = MI.getOperand(0).getReg(); 1677 Register Vec = MI.getOperand(1).getReg(); 1678 1679 LLT VecTy = MRI.getType(Vec); 1680 LLT EltTy = VecTy.getElementType(); 1681 assert(EltTy == MRI.getType(Dst)); 1682 1683 B.setInstr(MI); 1684 1685 if (IdxVal.getValue() < VecTy.getNumElements()) 1686 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); 1687 else 1688 B.buildUndef(Dst); 1689 1690 MI.eraseFromParent(); 1691 return true; 1692 } 1693 1694 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1695 MachineInstr &MI, MachineRegisterInfo &MRI, 1696 MachineIRBuilder &B) const { 1697 // TODO: Should move some of this into LegalizerHelper. 1698 1699 // TODO: Promote dynamic indexing of s16 to s32 1700 // TODO: Dynamic s64 indexing is only legal for SGPR. 1701 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); 1702 if (!IdxVal) // Dynamic case will be selected to register indexing. 1703 return true; 1704 1705 Register Dst = MI.getOperand(0).getReg(); 1706 Register Vec = MI.getOperand(1).getReg(); 1707 Register Ins = MI.getOperand(2).getReg(); 1708 1709 LLT VecTy = MRI.getType(Vec); 1710 LLT EltTy = VecTy.getElementType(); 1711 assert(EltTy == MRI.getType(Ins)); 1712 1713 B.setInstr(MI); 1714 1715 if (IdxVal.getValue() < VecTy.getNumElements()) 1716 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); 1717 else 1718 B.buildUndef(Dst); 1719 1720 MI.eraseFromParent(); 1721 return true; 1722 } 1723 1724 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) { 1725 assert(Mask.size() == 2); 1726 1727 // If one half is undef, the other is trivially in the same reg. 
1728 if (Mask[0] == -1 || Mask[1] == -1) 1729 return true; 1730 return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) || 1731 ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3)); 1732 } 1733 1734 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1735 MachineInstr &MI, MachineRegisterInfo &MRI, 1736 MachineIRBuilder &B) const { 1737 const LLT V2S16 = LLT::vector(2, 16); 1738 1739 Register Dst = MI.getOperand(0).getReg(); 1740 Register Src0 = MI.getOperand(1).getReg(); 1741 LLT DstTy = MRI.getType(Dst); 1742 LLT SrcTy = MRI.getType(Src0); 1743 1744 if (SrcTy == V2S16 && DstTy == V2S16 && 1745 isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1746 return true; 1747 1748 MachineIRBuilder HelperBuilder(MI); 1749 GISelObserverWrapper DummyObserver; 1750 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1751 HelperBuilder.setInstr(MI); 1752 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1753 } 1754 1755 bool AMDGPULegalizerInfo::legalizeSinCos( 1756 MachineInstr &MI, MachineRegisterInfo &MRI, 1757 MachineIRBuilder &B) const { 1758 B.setInstr(MI); 1759 1760 Register DstReg = MI.getOperand(0).getReg(); 1761 Register SrcReg = MI.getOperand(1).getReg(); 1762 LLT Ty = MRI.getType(DstReg); 1763 unsigned Flags = MI.getFlags(); 1764 1765 Register TrigVal; 1766 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1767 if (ST.hasTrigReducedRange()) { 1768 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1769 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1770 .addUse(MulVal.getReg(0)) 1771 .setMIFlags(Flags).getReg(0); 1772 } else 1773 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1774 1775 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1776 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1777 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1778 .addUse(TrigVal) 1779 .setMIFlags(Flags); 1780 MI.eraseFromParent(); 1781 return true; 1782 } 1783 1784 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1785 Register DstReg, LLT PtrTy, 1786 MachineIRBuilder &B, const GlobalValue *GV, 1787 unsigned Offset, unsigned GAFlags) const { 1788 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1789 // to the following code sequence: 1790 // 1791 // For constant address space: 1792 // s_getpc_b64 s[0:1] 1793 // s_add_u32 s0, s0, $symbol 1794 // s_addc_u32 s1, s1, 0 1795 // 1796 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1797 // a fixup or relocation is emitted to replace $symbol with a literal 1798 // constant, which is a pc-relative offset from the encoding of the $symbol 1799 // operand to the global variable. 1800 // 1801 // For global address space: 1802 // s_getpc_b64 s[0:1] 1803 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1804 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1805 // 1806 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1807 // fixups or relocations are emitted to replace $symbol@*@lo and 1808 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1809 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1810 // operand to the global variable. 
1811 // 1812 // What we want here is an offset from the value returned by s_getpc 1813 // (which is the address of the s_add_u32 instruction) to the global 1814 // variable, but since the encoding of $symbol starts 4 bytes after the start 1815 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1816 // small. This requires us to add 4 to the global variable offset in order to 1817 // compute the correct address. 1818 1819 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1820 1821 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1822 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1823 1824 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1825 .addDef(PCReg); 1826 1827 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1828 if (GAFlags == SIInstrInfo::MO_NONE) 1829 MIB.addImm(0); 1830 else 1831 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1832 1833 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1834 1835 if (PtrTy.getSizeInBits() == 32) 1836 B.buildExtract(DstReg, PCReg, 0); 1837 return true; 1838 } 1839 1840 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1841 MachineInstr &MI, MachineRegisterInfo &MRI, 1842 MachineIRBuilder &B) const { 1843 Register DstReg = MI.getOperand(0).getReg(); 1844 LLT Ty = MRI.getType(DstReg); 1845 unsigned AS = Ty.getAddressSpace(); 1846 1847 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1848 MachineFunction &MF = B.getMF(); 1849 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1850 B.setInstr(MI); 1851 1852 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1853 if (!MFI->isEntryFunction()) { 1854 const Function &Fn = MF.getFunction(); 1855 DiagnosticInfoUnsupported BadLDSDecl( 1856 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1857 Fn.getContext().diagnose(BadLDSDecl); 1858 } 1859 1860 // TODO: We could emit code to handle the initialization somewhere. 1861 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1862 const SITargetLowering *TLI = ST.getTargetLowering(); 1863 if (!TLI->shouldUseLDSConstAddress(GV)) { 1864 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 1865 return true; // Leave in place; 1866 } 1867 1868 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 1869 MI.eraseFromParent(); 1870 return true; 1871 } 1872 1873 const Function &Fn = MF.getFunction(); 1874 DiagnosticInfoUnsupported BadInit( 1875 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 1876 Fn.getContext().diagnose(BadInit); 1877 return true; 1878 } 1879 1880 const SITargetLowering *TLI = ST.getTargetLowering(); 1881 1882 if (TLI->shouldEmitFixup(GV)) { 1883 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 1884 MI.eraseFromParent(); 1885 return true; 1886 } 1887 1888 if (TLI->shouldEmitPCReloc(GV)) { 1889 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 1890 MI.eraseFromParent(); 1891 return true; 1892 } 1893 1894 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1895 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 1896 1897 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 1898 MachinePointerInfo::getGOT(MF), 1899 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1900 MachineMemOperand::MOInvariant, 1901 8 /*Size*/, 8 /*Align*/); 1902 1903 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 1904 1905 if (Ty.getSizeInBits() == 32) { 1906 // Truncate if this is a 32-bit constant adrdess. 
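// (The GOT entry itself is a 64-bit constant-address pointer, so load the
// full entry and extract the low half.)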
1907 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 1908 B.buildExtract(DstReg, Load, 0); 1909 } else 1910 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 1911 1912 MI.eraseFromParent(); 1913 return true; 1914 } 1915 1916 bool AMDGPULegalizerInfo::legalizeLoad( 1917 MachineInstr &MI, MachineRegisterInfo &MRI, 1918 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 1919 B.setInstr(MI); 1920 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1921 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 1922 Observer.changingInstr(MI); 1923 MI.getOperand(1).setReg(Cast.getReg(0)); 1924 Observer.changedInstr(MI); 1925 return true; 1926 } 1927 1928 bool AMDGPULegalizerInfo::legalizeFMad( 1929 MachineInstr &MI, MachineRegisterInfo &MRI, 1930 MachineIRBuilder &B) const { 1931 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 1932 assert(Ty.isScalar()); 1933 1934 MachineFunction &MF = B.getMF(); 1935 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1936 1937 // TODO: Always legal with future ftz flag. 1938 // FIXME: Do we need just output? 1939 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 1940 return true; 1941 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 1942 return true; 1943 1944 MachineIRBuilder HelperBuilder(MI); 1945 GISelObserverWrapper DummyObserver; 1946 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1947 HelperBuilder.setMBB(*MI.getParent()); 1948 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 1949 } 1950 1951 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 1952 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 1953 Register DstReg = MI.getOperand(0).getReg(); 1954 Register PtrReg = MI.getOperand(1).getReg(); 1955 Register CmpVal = MI.getOperand(2).getReg(); 1956 Register NewVal = MI.getOperand(3).getReg(); 1957 1958 assert(SITargetLowering::isFlatGlobalAddrSpace( 1959 MRI.getType(PtrReg).getAddressSpace()) && 1960 "this should not have been custom lowered"); 1961 1962 LLT ValTy = MRI.getType(CmpVal); 1963 LLT VecTy = LLT::vector(2, ValTy); 1964 1965 B.setInstr(MI); 1966 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 1967 1968 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 1969 .addDef(DstReg) 1970 .addUse(PtrReg) 1971 .addUse(PackedVal) 1972 .setMemRefs(MI.memoperands()); 1973 1974 MI.eraseFromParent(); 1975 return true; 1976 } 1977 1978 bool AMDGPULegalizerInfo::legalizeFlog( 1979 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 1980 Register Dst = MI.getOperand(0).getReg(); 1981 Register Src = MI.getOperand(1).getReg(); 1982 LLT Ty = B.getMRI()->getType(Dst); 1983 unsigned Flags = MI.getFlags(); 1984 B.setInstr(MI); 1985 1986 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 1987 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 1988 1989 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 1990 MI.eraseFromParent(); 1991 return true; 1992 } 1993 1994 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 1995 MachineIRBuilder &B) const { 1996 Register Dst = MI.getOperand(0).getReg(); 1997 Register Src = MI.getOperand(1).getReg(); 1998 unsigned Flags = MI.getFlags(); 1999 LLT Ty = B.getMRI()->getType(Dst); 2000 B.setInstr(MI); 2001 2002 auto K = B.buildFConstant(Ty, numbers::log2e); 2003 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2004 B.buildFExp2(Dst, Mul, Flags); 2005 MI.eraseFromParent(); 2006 return true; 2007 } 2008 2009 // Find a source register, ignoring 
any possible source modifiers. 2010 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2011 Register ModSrc = OrigSrc; 2012 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2013 ModSrc = SrcFNeg->getOperand(1).getReg(); 2014 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2015 ModSrc = SrcFAbs->getOperand(1).getReg(); 2016 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2017 ModSrc = SrcFAbs->getOperand(1).getReg(); 2018 return ModSrc; 2019 } 2020 2021 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2022 MachineRegisterInfo &MRI, 2023 MachineIRBuilder &B) const { 2024 B.setInstr(MI); 2025 2026 const LLT S1 = LLT::scalar(1); 2027 const LLT S64 = LLT::scalar(64); 2028 Register Dst = MI.getOperand(0).getReg(); 2029 Register OrigSrc = MI.getOperand(1).getReg(); 2030 unsigned Flags = MI.getFlags(); 2031 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2032 "this should not have been custom lowered"); 2033 2034 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2035 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2036 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2037 // V_FRACT bug is: 2038 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2039 // 2040 // Convert floor(x) to (x - fract(x)) 2041 2042 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2043 .addUse(OrigSrc) 2044 .setMIFlags(Flags); 2045 2046 // Give source modifier matching some assistance before obscuring a foldable 2047 // pattern. 2048 2049 // TODO: We can avoid the neg on the fract? The input sign to fract 2050 // shouldn't matter? 2051 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2052 2053 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2054 2055 Register Min = MRI.createGenericVirtualRegister(S64); 2056 2057 // We don't need to concern ourselves with the snan handling difference, so 2058 // use the one which will directly select. 2059 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2060 if (MFI->getMode().IEEE) 2061 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2062 else 2063 B.buildFMinNum(Min, Fract, Const, Flags); 2064 2065 Register CorrectedFract = Min; 2066 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2067 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2068 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2069 } 2070 2071 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2072 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2073 2074 MI.eraseFromParent(); 2075 return true; 2076 } 2077 2078 // Turn an illegal packed v2s16 build vector into bit operations. 2079 // TODO: This should probably be a bitcast action in LegalizerHelper. 
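// A sketch of the intended transform on the generic MIR:
//   %dst:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// becomes
//   %merge:_(s32) = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %merge:_(s32)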
2080 bool AMDGPULegalizerInfo::legalizeBuildVector( 2081 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2082 Register Dst = MI.getOperand(0).getReg(); 2083 LLT DstTy = MRI.getType(Dst); 2084 const LLT S32 = LLT::scalar(32); 2085 const LLT V2S16 = LLT::vector(2, 16); 2086 (void)DstTy; 2087 (void)V2S16; 2088 assert(DstTy == V2S16); 2089 2090 Register Src0 = MI.getOperand(1).getReg(); 2091 Register Src1 = MI.getOperand(2).getReg(); 2092 assert(MRI.getType(Src0) == LLT::scalar(16)); 2093 2094 B.setInstr(MI); 2095 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2096 B.buildBitcast(Dst, Merge); 2097 2098 MI.eraseFromParent(); 2099 return true; 2100 } 2101 2102 // Return the use branch instruction, otherwise null if the usage is invalid. 2103 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2104 MachineRegisterInfo &MRI, 2105 MachineInstr *&Br) { 2106 Register CondDef = MI.getOperand(0).getReg(); 2107 if (!MRI.hasOneNonDBGUse(CondDef)) 2108 return nullptr; 2109 2110 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2111 if (UseMI.getParent() != MI.getParent() || 2112 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2113 return nullptr; 2114 2115 // Make sure the cond br is followed by a G_BR 2116 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2117 if (Next != MI.getParent()->end()) { 2118 if (Next->getOpcode() != AMDGPU::G_BR) 2119 return nullptr; 2120 Br = &*Next; 2121 } 2122 2123 return &UseMI; 2124 } 2125 2126 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, 2127 Register Reg, LLT Ty) const { 2128 Register LiveIn = MRI.getLiveInVirtReg(Reg); 2129 if (LiveIn) 2130 return LiveIn; 2131 2132 Register NewReg = MRI.createGenericVirtualRegister(Ty); 2133 MRI.addLiveIn(Reg, NewReg); 2134 return NewReg; 2135 } 2136 2137 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2138 const ArgDescriptor *Arg) const { 2139 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2140 return false; // TODO: Handle these 2141 2142 assert(Arg->getRegister().isPhysical()); 2143 2144 MachineRegisterInfo &MRI = *B.getMRI(); 2145 2146 LLT Ty = MRI.getType(DstReg); 2147 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); 2148 2149 if (Arg->isMasked()) { 2150 // TODO: Should we try to emit this once in the entry block? 2151 const LLT S32 = LLT::scalar(32); 2152 const unsigned Mask = Arg->getMask(); 2153 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2154 2155 Register AndMaskSrc = LiveIn; 2156 2157 if (Shift != 0) { 2158 auto ShiftAmt = B.buildConstant(S32, Shift); 2159 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2160 } 2161 2162 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2163 } else 2164 B.buildCopy(DstReg, LiveIn); 2165 2166 // Insert the argument copy if it doens't already exist. 2167 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
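// If nothing has defined the live-in vreg yet, emit the copy from the
// physical argument register at the top of the entry block ourselves, then
// restore the original insertion point.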
2168 if (!MRI.getVRegDef(LiveIn)) { 2169 // FIXME: Should have scoped insert pt 2170 MachineBasicBlock &OrigInsBB = B.getMBB(); 2171 auto OrigInsPt = B.getInsertPt(); 2172 2173 MachineBasicBlock &EntryMBB = B.getMF().front(); 2174 EntryMBB.addLiveIn(Arg->getRegister()); 2175 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2176 B.buildCopy(LiveIn, Arg->getRegister()); 2177 2178 B.setInsertPt(OrigInsBB, OrigInsPt); 2179 } 2180 2181 return true; 2182 } 2183 2184 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2185 MachineInstr &MI, 2186 MachineRegisterInfo &MRI, 2187 MachineIRBuilder &B, 2188 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2189 B.setInstr(MI); 2190 2191 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2192 2193 const ArgDescriptor *Arg; 2194 const TargetRegisterClass *RC; 2195 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2196 if (!Arg) { 2197 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2198 return false; 2199 } 2200 2201 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 2202 MI.eraseFromParent(); 2203 return true; 2204 } 2205 2206 return false; 2207 } 2208 2209 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2210 MachineRegisterInfo &MRI, 2211 MachineIRBuilder &B) const { 2212 B.setInstr(MI); 2213 Register Dst = MI.getOperand(0).getReg(); 2214 LLT DstTy = MRI.getType(Dst); 2215 LLT S16 = LLT::scalar(16); 2216 LLT S32 = LLT::scalar(32); 2217 LLT S64 = LLT::scalar(64); 2218 2219 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2220 return true; 2221 2222 if (DstTy == S16) 2223 return legalizeFDIV16(MI, MRI, B); 2224 if (DstTy == S32) 2225 return legalizeFDIV32(MI, MRI, B); 2226 if (DstTy == S64) 2227 return legalizeFDIV64(MI, MRI, B); 2228 2229 return false; 2230 } 2231 2232 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2233 MachineRegisterInfo &MRI, 2234 MachineIRBuilder &B) const { 2235 Register Res = MI.getOperand(0).getReg(); 2236 Register LHS = MI.getOperand(1).getReg(); 2237 Register RHS = MI.getOperand(2).getReg(); 2238 2239 uint16_t Flags = MI.getFlags(); 2240 2241 LLT ResTy = MRI.getType(Res); 2242 LLT S32 = LLT::scalar(32); 2243 LLT S64 = LLT::scalar(64); 2244 2245 const MachineFunction &MF = B.getMF(); 2246 bool Unsafe = 2247 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2248 2249 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2250 return false; 2251 2252 if (!Unsafe && ResTy == S32 && 2253 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2254 return false; 2255 2256 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2257 // 1 / x -> RCP(x) 2258 if (CLHS->isExactlyValue(1.0)) { 2259 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2260 .addUse(RHS) 2261 .setMIFlags(Flags); 2262 2263 MI.eraseFromParent(); 2264 return true; 2265 } 2266 2267 // -1 / x -> RCP( FNEG(x) ) 2268 if (CLHS->isExactlyValue(-1.0)) { 2269 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2270 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2271 .addUse(FNeg.getReg(0)) 2272 .setMIFlags(Flags); 2273 2274 MI.eraseFromParent(); 2275 return true; 2276 } 2277 } 2278 2279 // x / y -> x * (1.0 / y) 2280 if (Unsafe) { 2281 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2282 .addUse(RHS) 2283 .setMIFlags(Flags); 2284 B.buildFMul(Res, LHS, RCP, Flags); 2285 2286 MI.eraseFromParent(); 2287 return true; 2288 } 2289 2290 return false; 2291 } 2292 2293 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2294 MachineRegisterInfo &MRI, 2295 
MachineIRBuilder &B) const { 2296 B.setInstr(MI); 2297 Register Res = MI.getOperand(0).getReg(); 2298 Register LHS = MI.getOperand(1).getReg(); 2299 Register RHS = MI.getOperand(2).getReg(); 2300 2301 uint16_t Flags = MI.getFlags(); 2302 2303 LLT S16 = LLT::scalar(16); 2304 LLT S32 = LLT::scalar(32); 2305 2306 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2307 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2308 2309 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2310 .addUse(RHSExt.getReg(0)) 2311 .setMIFlags(Flags); 2312 2313 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2314 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2315 2316 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2317 .addUse(RDst.getReg(0)) 2318 .addUse(RHS) 2319 .addUse(LHS) 2320 .setMIFlags(Flags); 2321 2322 MI.eraseFromParent(); 2323 return true; 2324 } 2325 2326 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2327 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2328 static void toggleSPDenormMode(bool Enable, 2329 MachineIRBuilder &B, 2330 const GCNSubtarget &ST, 2331 AMDGPU::SIModeRegisterDefaults Mode) { 2332 // Set SP denorm mode to this value. 2333 unsigned SPDenormMode = 2334 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2335 2336 if (ST.hasDenormModeInst()) { 2337 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2338 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2339 2340 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2341 B.buildInstr(AMDGPU::S_DENORM_MODE) 2342 .addImm(NewDenormModeValue); 2343 2344 } else { 2345 // Select FP32 bit field in mode register. 2346 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2347 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2348 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2349 2350 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2351 .addImm(SPDenormMode) 2352 .addImm(SPDenormModeBitField); 2353 } 2354 } 2355 2356 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2357 MachineRegisterInfo &MRI, 2358 MachineIRBuilder &B) const { 2359 B.setInstr(MI); 2360 Register Res = MI.getOperand(0).getReg(); 2361 Register LHS = MI.getOperand(1).getReg(); 2362 Register RHS = MI.getOperand(2).getReg(); 2363 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2364 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2365 2366 uint16_t Flags = MI.getFlags(); 2367 2368 LLT S32 = LLT::scalar(32); 2369 LLT S1 = LLT::scalar(1); 2370 2371 auto One = B.buildFConstant(S32, 1.0f); 2372 2373 auto DenominatorScaled = 2374 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2375 .addUse(RHS) 2376 .addUse(LHS) 2377 .addImm(1) 2378 .setMIFlags(Flags); 2379 auto NumeratorScaled = 2380 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2381 .addUse(LHS) 2382 .addUse(RHS) 2383 .addImm(0) 2384 .setMIFlags(Flags); 2385 2386 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2387 .addUse(DenominatorScaled.getReg(0)) 2388 .setMIFlags(Flags); 2389 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2390 2391 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2392 // aren't modeled as reading it. 
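// If FP32 denormals are flushed by default, temporarily enable them around
// the refinement sequence below so the intermediate FMAs are not flushed,
// then restore the default mode afterwards.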
2393 if (!Mode.allFP32Denormals())
2394 toggleSPDenormMode(true, B, ST, Mode);
2395
2396 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2397 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2398 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2399 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2400 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2401 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2402
2403 if (!Mode.allFP32Denormals())
2404 toggleSPDenormMode(false, B, ST, Mode);
2405
2406 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2407 .addUse(Fma4.getReg(0))
2408 .addUse(Fma1.getReg(0))
2409 .addUse(Fma3.getReg(0))
2410 .addUse(NumeratorScaled.getReg(1))
2411 .setMIFlags(Flags);
2412
2413 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2414 .addUse(Fmas.getReg(0))
2415 .addUse(RHS)
2416 .addUse(LHS)
2417 .setMIFlags(Flags);
2418
2419 MI.eraseFromParent();
2420 return true;
2421 }
2422
2423 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2424 MachineRegisterInfo &MRI,
2425 MachineIRBuilder &B) const {
2426 B.setInstr(MI);
2427 Register Res = MI.getOperand(0).getReg();
2428 Register LHS = MI.getOperand(1).getReg();
2429 Register RHS = MI.getOperand(2).getReg();
2430
2431 uint16_t Flags = MI.getFlags();
2432
2433 LLT S64 = LLT::scalar(64);
2434 LLT S1 = LLT::scalar(1);
2435
2436 auto One = B.buildFConstant(S64, 1.0);
2437
2438 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2439 .addUse(LHS)
2440 .addUse(RHS)
2441 .addImm(1)
2442 .setMIFlags(Flags);
2443
2444 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2445
2446 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2447 .addUse(DivScale0.getReg(0))
2448 .setMIFlags(Flags);
2449
2450 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2451 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2452 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2453
2454 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2455 .addUse(LHS)
2456 .addUse(RHS)
2457 .addImm(0)
2458 .setMIFlags(Flags);
2459
2460 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2461 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2462 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2463
2464 Register Scale;
2465 if (!ST.hasUsableDivScaleConditionOutput()) {
2466 // Workaround a hardware bug on SI where the condition output from div_scale
2467 // is not usable.
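// Recompute the condition manually: compare the high (exponent) dwords of the
// div_scale results against those of the original operands, then XOR the two
// comparisons. This appears to reproduce what the missing VCC output would
// have reported.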
2468 2469 LLT S32 = LLT::scalar(32); 2470 2471 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2472 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2473 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2474 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2475 2476 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2477 Scale1Unmerge.getReg(1)); 2478 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2479 Scale0Unmerge.getReg(1)); 2480 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2481 } else { 2482 Scale = DivScale1.getReg(1); 2483 } 2484 2485 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2486 .addUse(Fma4.getReg(0)) 2487 .addUse(Fma3.getReg(0)) 2488 .addUse(Mul.getReg(0)) 2489 .addUse(Scale) 2490 .setMIFlags(Flags); 2491 2492 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2493 .addUse(Fmas.getReg(0)) 2494 .addUse(RHS) 2495 .addUse(LHS) 2496 .setMIFlags(Flags); 2497 2498 MI.eraseFromParent(); 2499 return true; 2500 } 2501 2502 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2503 MachineRegisterInfo &MRI, 2504 MachineIRBuilder &B) const { 2505 B.setInstr(MI); 2506 Register Res = MI.getOperand(0).getReg(); 2507 Register LHS = MI.getOperand(2).getReg(); 2508 Register RHS = MI.getOperand(3).getReg(); 2509 uint16_t Flags = MI.getFlags(); 2510 2511 LLT S32 = LLT::scalar(32); 2512 LLT S1 = LLT::scalar(1); 2513 2514 auto Abs = B.buildFAbs(S32, RHS, Flags); 2515 const APFloat C0Val(1.0f); 2516 2517 auto C0 = B.buildConstant(S32, 0x6f800000); 2518 auto C1 = B.buildConstant(S32, 0x2f800000); 2519 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2520 2521 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2522 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2523 2524 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2525 2526 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2527 .addUse(Mul0.getReg(0)) 2528 .setMIFlags(Flags); 2529 2530 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2531 2532 B.buildFMul(Res, Sel, Mul1, Flags); 2533 2534 MI.eraseFromParent(); 2535 return true; 2536 } 2537 2538 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2539 MachineRegisterInfo &MRI, 2540 MachineIRBuilder &B) const { 2541 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2542 if (!MFI->isEntryFunction()) { 2543 return legalizePreloadedArgIntrin(MI, MRI, B, 2544 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2545 } 2546 2547 B.setInstr(MI); 2548 2549 uint64_t Offset = 2550 ST.getTargetLowering()->getImplicitParameterOffset( 2551 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2552 Register DstReg = MI.getOperand(0).getReg(); 2553 LLT DstTy = MRI.getType(DstReg); 2554 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2555 2556 const ArgDescriptor *Arg; 2557 const TargetRegisterClass *RC; 2558 std::tie(Arg, RC) 2559 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2560 if (!Arg) 2561 return false; 2562 2563 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2564 if (!loadInputValue(KernargPtrReg, B, Arg)) 2565 return false; 2566 2567 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2568 MI.eraseFromParent(); 2569 return true; 2570 } 2571 2572 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2573 MachineRegisterInfo &MRI, 2574 MachineIRBuilder &B, 2575 unsigned AddrSpace) const { 2576 B.setInstr(MI); 2577 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2578 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2579 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2580 MI.eraseFromParent(); 2581 return true; 2582 } 2583 2584 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2585 // offset (the offset that is included in bounds checking and swizzling, to be 2586 // split between the instruction's voffset and immoffset fields) and soffset 2587 // (the offset that is excluded from bounds checking and swizzling, to go in 2588 // the instruction's soffset field). This function takes the first kind of 2589 // offset and figures out how to split it between voffset and immoffset. 2590 std::tuple<Register, unsigned, unsigned> 2591 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2592 Register OrigOffset) const { 2593 const unsigned MaxImm = 4095; 2594 Register BaseReg; 2595 unsigned TotalConstOffset; 2596 MachineInstr *OffsetDef; 2597 const LLT S32 = LLT::scalar(32); 2598 2599 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2600 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2601 2602 unsigned ImmOffset = TotalConstOffset; 2603 2604 // If the immediate value is too big for the immoffset field, put the value 2605 // and -4096 into the immoffset field so that the value that is copied/added 2606 // for the voffset field is a multiple of 4096, and it stands more chance 2607 // of being CSEd with the copy/add for another similar load/store. 2608 // However, do not do that rounding down to a multiple of 4096 if that is a 2609 // negative number, as it appears to be illegal to have a negative offset 2610 // in the vgpr, even if adding the immediate offset makes it positive. 2611 unsigned Overflow = ImmOffset & ~MaxImm; 2612 ImmOffset -= Overflow; 2613 if ((int32_t)Overflow < 0) { 2614 Overflow += ImmOffset; 2615 ImmOffset = 0; 2616 } 2617 2618 if (Overflow != 0) { 2619 if (!BaseReg) { 2620 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2621 } else { 2622 auto OverflowVal = B.buildConstant(S32, Overflow); 2623 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2624 } 2625 } 2626 2627 if (!BaseReg) 2628 BaseReg = B.buildConstant(S32, 0).getReg(0); 2629 2630 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2631 } 2632 2633 /// Handle register layout difference for f16 images for some subtargets. 2634 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2635 MachineRegisterInfo &MRI, 2636 Register Reg) const { 2637 if (!ST.hasUnpackedD16VMem()) 2638 return Reg; 2639 2640 const LLT S16 = LLT::scalar(16); 2641 const LLT S32 = LLT::scalar(32); 2642 LLT StoreVT = MRI.getType(Reg); 2643 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2644 2645 auto Unmerge = B.buildUnmerge(S16, Reg); 2646 2647 SmallVector<Register, 4> WideRegs; 2648 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2649 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2650 2651 int NumElts = StoreVT.getNumElements(); 2652 2653 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2654 } 2655 2656 Register AMDGPULegalizerInfo::fixStoreSourceType( 2657 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2658 MachineRegisterInfo *MRI = B.getMRI(); 2659 LLT Ty = MRI->getType(VData); 2660 2661 const LLT S16 = LLT::scalar(16); 2662 2663 // Fixup illegal register types for i8 stores. 
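// s8 and s16 store data is passed in a full 32-bit VGPR, so any-extend the
// value here; the memory operand still describes the original narrow size.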
2664 if (Ty == LLT::scalar(8) || Ty == S16) { 2665 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2666 return AnyExt; 2667 } 2668 2669 if (Ty.isVector()) { 2670 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2671 if (IsFormat) 2672 return handleD16VData(B, *MRI, VData); 2673 } 2674 } 2675 2676 return VData; 2677 } 2678 2679 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2680 MachineRegisterInfo &MRI, 2681 MachineIRBuilder &B, 2682 bool IsTyped, 2683 bool IsFormat) const { 2684 B.setInstr(MI); 2685 2686 Register VData = MI.getOperand(1).getReg(); 2687 LLT Ty = MRI.getType(VData); 2688 LLT EltTy = Ty.getScalarType(); 2689 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2690 const LLT S32 = LLT::scalar(32); 2691 2692 VData = fixStoreSourceType(B, VData, IsFormat); 2693 Register RSrc = MI.getOperand(2).getReg(); 2694 2695 MachineMemOperand *MMO = *MI.memoperands_begin(); 2696 const int MemSize = MMO->getSize(); 2697 2698 unsigned ImmOffset; 2699 unsigned TotalOffset; 2700 2701 // The typed intrinsics add an immediate after the registers. 2702 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2703 2704 // The struct intrinsic variants add one additional operand over raw. 2705 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2706 Register VIndex; 2707 int OpOffset = 0; 2708 if (HasVIndex) { 2709 VIndex = MI.getOperand(3).getReg(); 2710 OpOffset = 1; 2711 } 2712 2713 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2714 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2715 2716 unsigned Format = 0; 2717 if (IsTyped) { 2718 Format = MI.getOperand(5 + OpOffset).getImm(); 2719 ++OpOffset; 2720 } 2721 2722 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2723 2724 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2725 if (TotalOffset != 0) 2726 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2727 2728 unsigned Opc; 2729 if (IsTyped) { 2730 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 2731 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 2732 } else if (IsFormat) { 2733 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 2734 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 2735 } else { 2736 switch (MemSize) { 2737 case 1: 2738 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 2739 break; 2740 case 2: 2741 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 2742 break; 2743 default: 2744 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 2745 break; 2746 } 2747 } 2748 2749 if (!VIndex) 2750 VIndex = B.buildConstant(S32, 0).getReg(0); 2751 2752 auto MIB = B.buildInstr(Opc) 2753 .addUse(VData) // vdata 2754 .addUse(RSrc) // rsrc 2755 .addUse(VIndex) // vindex 2756 .addUse(VOffset) // voffset 2757 .addUse(SOffset) // soffset 2758 .addImm(ImmOffset); // offset(imm) 2759 2760 if (IsTyped) 2761 MIB.addImm(Format); 2762 2763 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2764 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2765 .addMemOperand(MMO); 2766 2767 MI.eraseFromParent(); 2768 return true; 2769 } 2770 2771 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 2772 MachineRegisterInfo &MRI, 2773 MachineIRBuilder &B, 2774 bool IsFormat, 2775 bool IsTyped) const { 2776 B.setInstr(MI); 2777 2778 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
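// Operand layout for these loads is:
//   dst, intrinsic-id, rsrc, [vindex], voffset, soffset, [format], aux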
2779 MachineMemOperand *MMO = *MI.memoperands_begin(); 2780 const int MemSize = MMO->getSize(); 2781 const LLT S32 = LLT::scalar(32); 2782 2783 Register Dst = MI.getOperand(0).getReg(); 2784 Register RSrc = MI.getOperand(2).getReg(); 2785 2786 // The typed intrinsics add an immediate after the registers. 2787 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2788 2789 // The struct intrinsic variants add one additional operand over raw. 2790 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2791 Register VIndex; 2792 int OpOffset = 0; 2793 if (HasVIndex) { 2794 VIndex = MI.getOperand(3).getReg(); 2795 OpOffset = 1; 2796 } 2797 2798 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2799 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2800 2801 unsigned Format = 0; 2802 if (IsTyped) { 2803 Format = MI.getOperand(5 + OpOffset).getImm(); 2804 ++OpOffset; 2805 } 2806 2807 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2808 unsigned ImmOffset; 2809 unsigned TotalOffset; 2810 2811 LLT Ty = MRI.getType(Dst); 2812 LLT EltTy = Ty.getScalarType(); 2813 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2814 const bool Unpacked = ST.hasUnpackedD16VMem(); 2815 2816 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2817 if (TotalOffset != 0) 2818 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2819 2820 unsigned Opc; 2821 2822 if (IsTyped) { 2823 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 2824 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 2825 } else if (IsFormat) { 2826 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 2827 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 2828 } else { 2829 switch (MemSize) { 2830 case 1: 2831 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 2832 break; 2833 case 2: 2834 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 2835 break; 2836 default: 2837 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 2838 break; 2839 } 2840 } 2841 2842 Register LoadDstReg; 2843 2844 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 2845 LLT UnpackedTy = Ty.changeElementSize(32); 2846 2847 if (IsExtLoad) 2848 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 2849 else if (Unpacked && IsD16 && Ty.isVector()) 2850 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 2851 else 2852 LoadDstReg = Dst; 2853 2854 if (!VIndex) 2855 VIndex = B.buildConstant(S32, 0).getReg(0); 2856 2857 auto MIB = B.buildInstr(Opc) 2858 .addDef(LoadDstReg) // vdata 2859 .addUse(RSrc) // rsrc 2860 .addUse(VIndex) // vindex 2861 .addUse(VOffset) // voffset 2862 .addUse(SOffset) // soffset 2863 .addImm(ImmOffset); // offset(imm) 2864 2865 if (IsTyped) 2866 MIB.addImm(Format); 2867 2868 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2869 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2870 .addMemOperand(MMO); 2871 2872 if (LoadDstReg != Dst) { 2873 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2874 2875 // Widen result for extending loads was widened. 
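// (The load was emitted into LoadDstReg with a wider type; narrow it back to
// the originally requested result type here.)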
2876 if (IsExtLoad) 2877 B.buildTrunc(Dst, LoadDstReg); 2878 else { 2879 // Repack to original 16-bit vector result 2880 // FIXME: G_TRUNC should work, but legalization currently fails 2881 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 2882 SmallVector<Register, 4> Repack; 2883 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 2884 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 2885 B.buildMerge(Dst, Repack); 2886 } 2887 } 2888 2889 MI.eraseFromParent(); 2890 return true; 2891 } 2892 2893 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 2894 MachineIRBuilder &B, 2895 bool IsInc) const { 2896 B.setInstr(MI); 2897 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 2898 AMDGPU::G_AMDGPU_ATOMIC_DEC; 2899 B.buildInstr(Opc) 2900 .addDef(MI.getOperand(0).getReg()) 2901 .addUse(MI.getOperand(2).getReg()) 2902 .addUse(MI.getOperand(3).getReg()) 2903 .cloneMemRefs(MI); 2904 MI.eraseFromParent(); 2905 return true; 2906 } 2907 2908 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 2909 switch (IntrID) { 2910 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 2911 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 2912 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 2913 case Intrinsic::amdgcn_raw_buffer_atomic_add: 2914 case Intrinsic::amdgcn_struct_buffer_atomic_add: 2915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 2916 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 2917 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 2918 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 2919 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 2920 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 2921 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 2922 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 2923 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 2924 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 2925 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 2926 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 2927 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 2928 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 2929 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 2930 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 2931 case Intrinsic::amdgcn_raw_buffer_atomic_and: 2932 case Intrinsic::amdgcn_struct_buffer_atomic_and: 2933 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 2934 case Intrinsic::amdgcn_raw_buffer_atomic_or: 2935 case Intrinsic::amdgcn_struct_buffer_atomic_or: 2936 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 2937 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 2938 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 2939 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 2940 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 2941 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 2942 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 2943 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 2944 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 2945 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 2946 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 2947 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 2948 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 2949 default: 2950 llvm_unreachable("unhandled atomic opcode"); 2951 } 2952 } 2953 2954 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 2955 MachineIRBuilder &B, 2956 Intrinsic::ID IID) const { 2957 B.setInstr(MI); 2958 2959 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 2960 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 2961 2962 Register Dst = MI.getOperand(0).getReg(); 2963 Register VData = 
MI.getOperand(2).getReg(); 2964 2965 Register CmpVal; 2966 int OpOffset = 0; 2967 2968 if (IsCmpSwap) { 2969 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 2970 ++OpOffset; 2971 } 2972 2973 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 2974 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 2975 2976 // The struct intrinsic variants add one additional operand over raw. 2977 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2978 Register VIndex; 2979 if (HasVIndex) { 2980 VIndex = MI.getOperand(4 + OpOffset).getReg(); 2981 ++OpOffset; 2982 } 2983 2984 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 2985 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 2986 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 2987 2988 MachineMemOperand *MMO = *MI.memoperands_begin(); 2989 2990 unsigned ImmOffset; 2991 unsigned TotalOffset; 2992 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2993 if (TotalOffset != 0) 2994 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 2995 2996 if (!VIndex) 2997 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 2998 2999 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3000 .addDef(Dst) 3001 .addUse(VData); // vdata 3002 3003 if (IsCmpSwap) 3004 MIB.addReg(CmpVal); 3005 3006 MIB.addUse(RSrc) // rsrc 3007 .addUse(VIndex) // vindex 3008 .addUse(VOffset) // voffset 3009 .addUse(SOffset) // soffset 3010 .addImm(ImmOffset) // offset(imm) 3011 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3012 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3013 .addMemOperand(MMO); 3014 3015 MI.eraseFromParent(); 3016 return true; 3017 } 3018 3019 // Produce a vector of s16 elements from s32 pieces. 3020 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg, 3021 ArrayRef<Register> UnmergeParts) { 3022 const LLT S16 = LLT::scalar(16); 3023 3024 SmallVector<Register, 4> RemergeParts(UnmergeParts.size()); 3025 for (int I = 0, E = UnmergeParts.size(); I != E; ++I) 3026 RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0); 3027 3028 B.buildBuildVector(DstReg, RemergeParts); 3029 } 3030 3031 /// Convert a set of s32 registers to a result vector with s16 elements. 3032 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg, 3033 ArrayRef<Register> UnmergeParts) { 3034 MachineRegisterInfo &MRI = *B.getMRI(); 3035 const LLT V2S16 = LLT::vector(2, 16); 3036 LLT TargetTy = MRI.getType(DstReg); 3037 int NumElts = UnmergeParts.size(); 3038 3039 if (NumElts == 1) { 3040 assert(TargetTy == V2S16); 3041 B.buildBitcast(DstReg, UnmergeParts[0]); 3042 return; 3043 } 3044 3045 SmallVector<Register, 4> RemergeParts(NumElts); 3046 for (int I = 0; I != NumElts; ++I) 3047 RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0); 3048 3049 if (TargetTy.getSizeInBits() == 32u * NumElts) { 3050 B.buildConcatVectors(DstReg, RemergeParts); 3051 return; 3052 } 3053 3054 const LLT V3S16 = LLT::vector(3, 16); 3055 const LLT V6S16 = LLT::vector(6, 16); 3056 3057 // Widen to v6s16 and unpack v3 parts. 3058 assert(TargetTy == V3S16); 3059 3060 RemergeParts.push_back(B.buildUndef(V2S16).getReg(0)); 3061 auto Concat = B.buildConcatVectors(V6S16, RemergeParts); 3062 B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat); 3063 } 3064 3065 // FIXME: Just vector trunc should be sufficent, but legalization currently 3066 // broken. 
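// e.g. an unpacked d16 load of <4 x s16> arrives as <4 x s32>; unmerge it into
// four s32 pieces, truncate each piece to s16, and rebuild the vector.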
3067 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg, 3068 Register WideDstReg) { 3069 const LLT S32 = LLT::scalar(32); 3070 const LLT S16 = LLT::scalar(16); 3071 3072 auto Unmerge = B.buildUnmerge(S32, WideDstReg); 3073 3074 int NumOps = Unmerge->getNumOperands() - 1; 3075 SmallVector<Register, 4> RemergeParts(NumOps); 3076 for (int I = 0; I != NumOps; ++I) 3077 RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0); 3078 3079 B.buildBuildVector(DstReg, RemergeParts); 3080 } 3081 3082 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3083 MachineInstr &MI, MachineIRBuilder &B, 3084 GISelChangeObserver &Observer, 3085 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3086 bool IsTFE = MI.getNumExplicitDefs() == 2; 3087 3088 // We are only processing the operands of d16 image operations on subtargets 3089 // that use the unpacked register layout, or need to repack the TFE result. 3090 3091 // TODO: Need to handle a16 images too 3092 // TODO: Do we need to guard against already legalized intrinsics? 3093 if (!IsTFE && !ST.hasUnpackedD16VMem()) 3094 return true; 3095 3096 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3097 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3098 3099 if (BaseOpcode->Atomic) // No d16 atomics, or TFE. 3100 return true; 3101 3102 B.setInstr(MI); 3103 3104 MachineRegisterInfo *MRI = B.getMRI(); 3105 const LLT S32 = LLT::scalar(32); 3106 const LLT S16 = LLT::scalar(16); 3107 3108 if (BaseOpcode->Store) { // No TFE for stores? 3109 Register VData = MI.getOperand(1).getReg(); 3110 LLT Ty = MRI->getType(VData); 3111 if (!Ty.isVector() || Ty.getElementType() != S16) 3112 return true; 3113 3114 B.setInstr(MI); 3115 3116 Observer.changingInstr(MI); 3117 MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData)); 3118 Observer.changedInstr(MI); 3119 return true; 3120 } 3121 3122 Register DstReg = MI.getOperand(0).getReg(); 3123 LLT Ty = MRI->getType(DstReg); 3124 const LLT EltTy = Ty.getScalarType(); 3125 const bool IsD16 = Ty.getScalarType() == S16; 3126 const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3127 3128 if (IsTFE) { 3129 // In the IR, TFE is supposed to be used with a 2 element struct return 3130 // type. The intruction really returns these two values in one contiguous 3131 // register, with one additional dword beyond the loaded data. Rewrite the 3132 // return type to use a single register result. 3133 Register Dst1Reg = MI.getOperand(1).getReg(); 3134 if (MRI->getType(Dst1Reg) != S32) 3135 return false; 3136 3137 // TODO: Make sure the TFE operand bit is set. 3138 3139 // The raw dword aligned data component of the load. The only legal cases 3140 // where this matters should be when using the packed D16 format, for 3141 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3142 LLT RoundedTy; 3143 LLT TFETy; 3144 3145 if (IsD16 && ST.hasUnpackedD16VMem()) { 3146 RoundedTy = LLT::scalarOrVector(NumElts, 32); 3147 TFETy = LLT::vector(NumElts + 1, 32); 3148 } else { 3149 unsigned EltSize = Ty.getScalarSizeInBits(); 3150 unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32; 3151 unsigned RoundedSize = 32 * RoundedElts; 3152 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3153 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3154 } 3155 3156 Register TFEReg = MRI->createGenericVirtualRegister(TFETy); 3157 Observer.changingInstr(MI); 3158 3159 MI.getOperand(0).setReg(TFEReg); 3160 MI.RemoveOperand(1); 3161 3162 Observer.changedInstr(MI); 3163 3164 // Insert after the instruction. 
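// The unmerge/repack emitted below must consume the rewritten instruction's
// single wide result, so it has to be placed after MI.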
3165 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3166 3167 // Now figure out how to copy the new result register back into the old 3168 // result. 3169 3170 SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg); 3171 int NumDataElts = TFETy.getNumElements() - 1; 3172 3173 if (!Ty.isVector()) { 3174 // Simplest case is a trivial unmerge (plus a truncate for d16). 3175 UnmergeResults[0] = Ty == S32 ? 3176 DstReg : MRI->createGenericVirtualRegister(S32); 3177 3178 B.buildUnmerge(UnmergeResults, TFEReg); 3179 if (Ty != S32) 3180 B.buildTrunc(DstReg, UnmergeResults[0]); 3181 return true; 3182 } 3183 3184 // We have to repack into a new vector of some kind. 3185 for (int I = 0; I != NumDataElts; ++I) 3186 UnmergeResults[I] = MRI->createGenericVirtualRegister(S32); 3187 B.buildUnmerge(UnmergeResults, TFEReg); 3188 3189 // Drop the final TFE element. 3190 ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts); 3191 3192 if (EltTy == S32) 3193 B.buildBuildVector(DstReg, DataPart); 3194 else if (ST.hasUnpackedD16VMem()) 3195 truncToS16Vector(B, DstReg, DataPart); 3196 else 3197 bitcastToS16Vector(B, DstReg, DataPart); 3198 3199 return true; 3200 } 3201 3202 // Must be an image load. 3203 if (!Ty.isVector() || Ty.getElementType() != S16) 3204 return true; 3205 3206 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3207 3208 LLT WidenedTy = Ty.changeElementType(S32); 3209 Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy); 3210 3211 Observer.changingInstr(MI); 3212 MI.getOperand(0).setReg(WideDstReg); 3213 Observer.changedInstr(MI); 3214 3215 repackUnpackedD16Load(B, DstReg, WideDstReg); 3216 return true; 3217 } 3218 3219 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 3220 MachineInstr &MI, MachineIRBuilder &B, 3221 GISelChangeObserver &Observer) const { 3222 Register Dst = MI.getOperand(0).getReg(); 3223 LLT Ty = B.getMRI()->getType(Dst); 3224 unsigned Size = Ty.getSizeInBits(); 3225 MachineFunction &MF = B.getMF(); 3226 3227 Observer.changingInstr(MI); 3228 3229 // FIXME: We don't really need this intermediate instruction. The intrinsic 3230 // should be fixed to have a memory operand. Since it's readnone, we're not 3231 // allowed to add one. 3232 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 3233 MI.RemoveOperand(1); // Remove intrinsic ID 3234 3235 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 3236 // TODO: Should this use datalayout alignment? 3237 const unsigned MemSize = (Size + 7) / 8; 3238 const unsigned MemAlign = 4; 3239 MachineMemOperand *MMO = MF.getMachineMemOperand( 3240 MachinePointerInfo(), 3241 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 3242 MachineMemOperand::MOInvariant, MemSize, MemAlign); 3243 MI.addMemOperand(MF, MMO); 3244 3245 // There are no 96-bit result scalar loads, but widening to 128-bit should 3246 // always be legal. We may need to restore this to a 96-bit result if it turns 3247 // out this needs to be converted to a vector load during RegBankSelect. 
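// e.g. a <3 x s32> result is padded to <4 x s32> and an s96 result is widened
// to s128; the extra element or bits are simply never read.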
3248 if (!isPowerOf2_32(Size)) { 3249 LegalizerHelper Helper(MF, *this, Observer, B); 3250 B.setInstr(MI); 3251 3252 if (Ty.isVector()) 3253 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 3254 else 3255 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 3256 } 3257 3258 Observer.changedInstr(MI); 3259 return true; 3260 } 3261 3262 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 3263 MachineIRBuilder &B, 3264 GISelChangeObserver &Observer) const { 3265 MachineRegisterInfo &MRI = *B.getMRI(); 3266 3267 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 3268 auto IntrID = MI.getIntrinsicID(); 3269 switch (IntrID) { 3270 case Intrinsic::amdgcn_if: 3271 case Intrinsic::amdgcn_else: { 3272 MachineInstr *Br = nullptr; 3273 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3274 const SIRegisterInfo *TRI 3275 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3276 3277 B.setInstr(*BrCond); 3278 Register Def = MI.getOperand(1).getReg(); 3279 Register Use = MI.getOperand(3).getReg(); 3280 3281 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 3282 if (Br) 3283 BrTarget = Br->getOperand(0).getMBB(); 3284 3285 if (IntrID == Intrinsic::amdgcn_if) { 3286 B.buildInstr(AMDGPU::SI_IF) 3287 .addDef(Def) 3288 .addUse(Use) 3289 .addMBB(BrTarget); 3290 } else { 3291 B.buildInstr(AMDGPU::SI_ELSE) 3292 .addDef(Def) 3293 .addUse(Use) 3294 .addMBB(BrTarget) 3295 .addImm(0); 3296 } 3297 3298 if (Br) 3299 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 3300 3301 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 3302 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 3303 MI.eraseFromParent(); 3304 BrCond->eraseFromParent(); 3305 return true; 3306 } 3307 3308 return false; 3309 } 3310 case Intrinsic::amdgcn_loop: { 3311 MachineInstr *Br = nullptr; 3312 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3313 const SIRegisterInfo *TRI 3314 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3315 3316 B.setInstr(*BrCond); 3317 3318 // FIXME: Need to adjust branch targets based on unconditional branch. 
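// Replace the G_BRCOND use of the loop intrinsic with SI_LOOP, which takes the
// loop mask register that was passed to the intrinsic and the conditional
// branch's target block.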
3319 Register Reg = MI.getOperand(2).getReg(); 3320 B.buildInstr(AMDGPU::SI_LOOP) 3321 .addUse(Reg) 3322 .addMBB(BrCond->getOperand(1).getMBB()); 3323 MI.eraseFromParent(); 3324 BrCond->eraseFromParent(); 3325 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 3326 return true; 3327 } 3328 3329 return false; 3330 } 3331 case Intrinsic::amdgcn_kernarg_segment_ptr: 3332 return legalizePreloadedArgIntrin( 3333 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3334 case Intrinsic::amdgcn_implicitarg_ptr: 3335 return legalizeImplicitArgPtr(MI, MRI, B); 3336 case Intrinsic::amdgcn_workitem_id_x: 3337 return legalizePreloadedArgIntrin(MI, MRI, B, 3338 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 3339 case Intrinsic::amdgcn_workitem_id_y: 3340 return legalizePreloadedArgIntrin(MI, MRI, B, 3341 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 3342 case Intrinsic::amdgcn_workitem_id_z: 3343 return legalizePreloadedArgIntrin(MI, MRI, B, 3344 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 3345 case Intrinsic::amdgcn_workgroup_id_x: 3346 return legalizePreloadedArgIntrin(MI, MRI, B, 3347 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 3348 case Intrinsic::amdgcn_workgroup_id_y: 3349 return legalizePreloadedArgIntrin(MI, MRI, B, 3350 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 3351 case Intrinsic::amdgcn_workgroup_id_z: 3352 return legalizePreloadedArgIntrin(MI, MRI, B, 3353 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 3354 case Intrinsic::amdgcn_dispatch_ptr: 3355 return legalizePreloadedArgIntrin(MI, MRI, B, 3356 AMDGPUFunctionArgInfo::DISPATCH_PTR); 3357 case Intrinsic::amdgcn_queue_ptr: 3358 return legalizePreloadedArgIntrin(MI, MRI, B, 3359 AMDGPUFunctionArgInfo::QUEUE_PTR); 3360 case Intrinsic::amdgcn_implicit_buffer_ptr: 3361 return legalizePreloadedArgIntrin( 3362 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 3363 case Intrinsic::amdgcn_dispatch_id: 3364 return legalizePreloadedArgIntrin(MI, MRI, B, 3365 AMDGPUFunctionArgInfo::DISPATCH_ID); 3366 case Intrinsic::amdgcn_fdiv_fast: 3367 return legalizeFDIVFastIntrin(MI, MRI, B); 3368 case Intrinsic::amdgcn_is_shared: 3369 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 3370 case Intrinsic::amdgcn_is_private: 3371 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 3372 case Intrinsic::amdgcn_wavefrontsize: { 3373 B.setInstr(MI); 3374 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 3375 MI.eraseFromParent(); 3376 return true; 3377 } 3378 case Intrinsic::amdgcn_s_buffer_load: 3379 return legalizeSBufferLoad(MI, B, Observer); 3380 case Intrinsic::amdgcn_raw_buffer_store: 3381 case Intrinsic::amdgcn_struct_buffer_store: 3382 return legalizeBufferStore(MI, MRI, B, false, false); 3383 case Intrinsic::amdgcn_raw_buffer_store_format: 3384 case Intrinsic::amdgcn_struct_buffer_store_format: 3385 return legalizeBufferStore(MI, MRI, B, false, true); 3386 case Intrinsic::amdgcn_raw_tbuffer_store: 3387 case Intrinsic::amdgcn_struct_tbuffer_store: 3388 return legalizeBufferStore(MI, MRI, B, true, true); 3389 case Intrinsic::amdgcn_raw_buffer_load: 3390 case Intrinsic::amdgcn_struct_buffer_load: 3391 return legalizeBufferLoad(MI, MRI, B, false, false); 3392 case Intrinsic::amdgcn_raw_buffer_load_format: 3393 case Intrinsic::amdgcn_struct_buffer_load_format: 3394 return legalizeBufferLoad(MI, MRI, B, true, false); 3395 case Intrinsic::amdgcn_raw_tbuffer_load: 3396 case Intrinsic::amdgcn_struct_tbuffer_load: 3397 return legalizeBufferLoad(MI, MRI, B, true, true); 3398 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3399 case 
Intrinsic::amdgcn_struct_buffer_atomic_swap: 3400 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3401 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3402 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3403 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3404 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3405 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3406 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3407 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3408 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3409 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3410 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3411 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3412 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3413 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3414 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3415 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3416 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3417 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3418 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3419 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3420 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3421 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3422 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3423 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3424 return legalizeBufferAtomic(MI, B, IntrID); 3425 case Intrinsic::amdgcn_atomic_inc: 3426 return legalizeAtomicIncDec(MI, B, true); 3427 case Intrinsic::amdgcn_atomic_dec: 3428 return legalizeAtomicIncDec(MI, B, false); 3429 default: { 3430 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 3431 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 3432 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 3433 return true; 3434 } 3435 } 3436 3437 return true; 3438 } 3439