//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}
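
// fewerEltsToSize64Vector (below) shrinks a wide vector so that each piece it
// will later be split into is at most 64 bits. A worked example of the
// arithmetic, derived from the code and shown for illustration only:
//   <4 x s32> (128 bits): Pieces = 2, NewNumElts = (4 + 1) / 2 = 2 -> <2 x s32>
//   <8 x s16> (128 bits): Pieces = 2, NewNumElts = (8 + 1) / 2 = 4 -> <4 x s16>
//   <3 x s32> (96 bits):  Pieces = 2, NewNumElts = (3 + 1) / 2 = 2 -> <2 x s32>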
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 101 return [=](const LegalityQuery &Query) { 102 const LLT Ty = Query.Types[TypeIdx]; 103 const LLT EltTy = Ty.getElementType(); 104 unsigned Size = Ty.getSizeInBits(); 105 unsigned Pieces = (Size + 63) / 64; 106 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 107 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 108 }; 109 } 110 111 // Increase the number of vector elements to reach the next multiple of 32-bit 112 // type. 113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 114 return [=](const LegalityQuery &Query) { 115 const LLT Ty = Query.Types[TypeIdx]; 116 117 const LLT EltTy = Ty.getElementType(); 118 const int Size = Ty.getSizeInBits(); 119 const int EltSize = EltTy.getSizeInBits(); 120 const int NextMul32 = (Size + 31) / 32; 121 122 assert(EltSize < 32); 123 124 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 125 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 126 }; 127 } 128 129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 130 return [=](const LegalityQuery &Query) { 131 const LLT QueryTy = Query.Types[TypeIdx]; 132 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 133 }; 134 } 135 136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 137 return [=](const LegalityQuery &Query) { 138 const LLT QueryTy = Query.Types[TypeIdx]; 139 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 140 }; 141 } 142 143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 144 return [=](const LegalityQuery &Query) { 145 const LLT QueryTy = Query.Types[TypeIdx]; 146 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 147 }; 148 } 149 150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 151 // v2s16. 
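// For example s32, s64, v2s32, and v4s16 are register types here, while s48
// and v3s16 are not (16-bit element vectors must have an even element count).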
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 153 return [=](const LegalityQuery &Query) { 154 const LLT Ty = Query.Types[TypeIdx]; 155 if (Ty.isVector()) { 156 const int EltSize = Ty.getElementType().getSizeInBits(); 157 return EltSize == 32 || EltSize == 64 || 158 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 159 EltSize == 128 || EltSize == 256; 160 } 161 162 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 163 }; 164 } 165 166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 167 return [=](const LegalityQuery &Query) { 168 const LLT QueryTy = Query.Types[TypeIdx]; 169 return QueryTy.isVector() && QueryTy.getElementType() == Type; 170 }; 171 } 172 173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 174 return [=](const LegalityQuery &Query) { 175 const LLT Ty = Query.Types[TypeIdx]; 176 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 177 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 178 }; 179 } 180 181 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 182 const GCNTargetMachine &TM) 183 : ST(ST_) { 184 using namespace TargetOpcode; 185 186 auto GetAddrSpacePtr = [&TM](unsigned AS) { 187 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 188 }; 189 190 const LLT S1 = LLT::scalar(1); 191 const LLT S16 = LLT::scalar(16); 192 const LLT S32 = LLT::scalar(32); 193 const LLT S64 = LLT::scalar(64); 194 const LLT S96 = LLT::scalar(96); 195 const LLT S128 = LLT::scalar(128); 196 const LLT S256 = LLT::scalar(256); 197 const LLT S1024 = LLT::scalar(1024); 198 199 const LLT V2S16 = LLT::vector(2, 16); 200 const LLT V4S16 = LLT::vector(4, 16); 201 202 const LLT V2S32 = LLT::vector(2, 32); 203 const LLT V3S32 = LLT::vector(3, 32); 204 const LLT V4S32 = LLT::vector(4, 32); 205 const LLT V5S32 = LLT::vector(5, 32); 206 const LLT V6S32 = LLT::vector(6, 32); 207 const LLT V7S32 = LLT::vector(7, 32); 208 const LLT V8S32 = LLT::vector(8, 32); 209 const LLT V9S32 = LLT::vector(9, 32); 210 const LLT V10S32 = LLT::vector(10, 32); 211 const LLT V11S32 = LLT::vector(11, 32); 212 const LLT V12S32 = LLT::vector(12, 32); 213 const LLT V13S32 = LLT::vector(13, 32); 214 const LLT V14S32 = LLT::vector(14, 32); 215 const LLT V15S32 = LLT::vector(15, 32); 216 const LLT V16S32 = LLT::vector(16, 32); 217 const LLT V32S32 = LLT::vector(32, 32); 218 219 const LLT V2S64 = LLT::vector(2, 64); 220 const LLT V3S64 = LLT::vector(3, 64); 221 const LLT V4S64 = LLT::vector(4, 64); 222 const LLT V5S64 = LLT::vector(5, 64); 223 const LLT V6S64 = LLT::vector(6, 64); 224 const LLT V7S64 = LLT::vector(7, 64); 225 const LLT V8S64 = LLT::vector(8, 64); 226 const LLT V16S64 = LLT::vector(16, 64); 227 228 std::initializer_list<LLT> AllS32Vectors = 229 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 230 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 231 std::initializer_list<LLT> AllS64Vectors = 232 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 233 234 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 235 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 236 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 237 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 238 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 239 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 240 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 241 242 const LLT CodePtr = FlatPtr; 243 244 const 
std::initializer_list<LLT> AddrSpaces64 = { 245 GlobalPtr, ConstantPtr, FlatPtr 246 }; 247 248 const std::initializer_list<LLT> AddrSpaces32 = { 249 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 250 }; 251 252 const std::initializer_list<LLT> FPTypesBase = { 253 S32, S64 254 }; 255 256 const std::initializer_list<LLT> FPTypes16 = { 257 S32, S64, S16 258 }; 259 260 const std::initializer_list<LLT> FPTypesPK16 = { 261 S32, S64, S16, V2S16 262 }; 263 264 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 265 266 setAction({G_BRCOND, S1}, Legal); // VCC branches 267 setAction({G_BRCOND, S32}, Legal); // SCC branches 268 269 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 270 // elements for v3s16 271 getActionDefinitionsBuilder(G_PHI) 272 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 273 .legalFor(AllS32Vectors) 274 .legalFor(AllS64Vectors) 275 .legalFor(AddrSpaces64) 276 .legalFor(AddrSpaces32) 277 .clampScalar(0, S32, S256) 278 .widenScalarToNextPow2(0, 32) 279 .clampMaxNumElements(0, S32, 16) 280 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 281 .legalIf(isPointer(0)); 282 283 if (ST.has16BitInsts()) { 284 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 285 .legalFor({S32, S16}) 286 .clampScalar(0, S16, S32) 287 .scalarize(0); 288 } else { 289 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 290 .legalFor({S32}) 291 .clampScalar(0, S32, S32) 292 .scalarize(0); 293 } 294 295 // FIXME: Not really legal. Placeholder for custom lowering. 296 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 297 .legalFor({S32, S64}) 298 .clampScalar(0, S32, S64) 299 .widenScalarToNextPow2(0, 32) 300 .scalarize(0); 301 302 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 303 .legalFor({S32}) 304 .clampScalar(0, S32, S32) 305 .scalarize(0); 306 307 // Report legal for any types we can handle anywhere. For the cases only legal 308 // on the SALU, RegBankSelect will be able to re-legalize. 309 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 310 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 311 .clampScalar(0, S32, S64) 312 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 313 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 314 .widenScalarToNextPow2(0) 315 .scalarize(0); 316 317 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 318 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 319 .legalFor({{S32, S1}, {S32, S32}}) 320 .clampScalar(0, S32, S32) 321 .scalarize(0); // TODO: Implement. 322 323 getActionDefinitionsBuilder(G_BITCAST) 324 // Don't worry about the size constraint. 
325 .legalIf(all(isRegisterType(0), isRegisterType(1))) 326 .lower(); 327 328 329 getActionDefinitionsBuilder(G_CONSTANT) 330 .legalFor({S1, S32, S64, S16, GlobalPtr, 331 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 332 .clampScalar(0, S32, S64) 333 .widenScalarToNextPow2(0) 334 .legalIf(isPointer(0)); 335 336 getActionDefinitionsBuilder(G_FCONSTANT) 337 .legalFor({S32, S64, S16}) 338 .clampScalar(0, S16, S64); 339 340 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 341 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 342 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 343 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 344 .clampScalarOrElt(0, S32, S1024) 345 .legalIf(isMultiple32(0)) 346 .widenScalarToNextPow2(0, 32) 347 .clampMaxNumElements(0, S32, 16); 348 349 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 350 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 351 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); 352 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 353 354 auto &FPOpActions = getActionDefinitionsBuilder( 355 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 356 .legalFor({S32, S64}); 357 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 358 .customFor({S32, S64}); 359 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 360 .customFor({S32, S64}); 361 362 if (ST.has16BitInsts()) { 363 if (ST.hasVOP3PInsts()) 364 FPOpActions.legalFor({S16, V2S16}); 365 else 366 FPOpActions.legalFor({S16}); 367 368 TrigActions.customFor({S16}); 369 FDIVActions.customFor({S16}); 370 } 371 372 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 373 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 374 375 if (ST.hasVOP3PInsts()) { 376 MinNumMaxNum.customFor(FPTypesPK16) 377 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 378 .clampMaxNumElements(0, S16, 2) 379 .clampScalar(0, S16, S64) 380 .scalarize(0); 381 } else if (ST.has16BitInsts()) { 382 MinNumMaxNum.customFor(FPTypes16) 383 .clampScalar(0, S16, S64) 384 .scalarize(0); 385 } else { 386 MinNumMaxNum.customFor(FPTypesBase) 387 .clampScalar(0, S32, S64) 388 .scalarize(0); 389 } 390 391 if (ST.hasVOP3PInsts()) 392 FPOpActions.clampMaxNumElements(0, S16, 2); 393 394 FPOpActions 395 .scalarize(0) 396 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 397 398 TrigActions 399 .scalarize(0) 400 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 401 402 FDIVActions 403 .scalarize(0) 404 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 405 406 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 407 .legalFor(FPTypesPK16) 408 .clampMaxNumElements(0, S16, 2) 409 .scalarize(0) 410 .clampScalar(0, S16, S64); 411 412 if (ST.has16BitInsts()) { 413 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 414 .legalFor({S32, S64, S16}) 415 .scalarize(0) 416 .clampScalar(0, S16, S64); 417 } else { 418 getActionDefinitionsBuilder(G_FSQRT) 419 .legalFor({S32, S64}) 420 .scalarize(0) 421 .clampScalar(0, S32, S64); 422 423 if (ST.hasFractBug()) { 424 getActionDefinitionsBuilder(G_FFLOOR) 425 .customFor({S64}) 426 .legalFor({S32, S64}) 427 .scalarize(0) 428 .clampScalar(0, S32, S64); 429 } else { 430 getActionDefinitionsBuilder(G_FFLOOR) 431 .legalFor({S32, S64}) 432 .scalarize(0) 433 .clampScalar(0, S32, S64); 434 } 435 } 436 437 getActionDefinitionsBuilder(G_FPTRUNC) 438 .legalFor({{S32, S64}, {S16, S32}}) 439 .scalarize(0); 440 441 getActionDefinitionsBuilder(G_FPEXT) 442 .legalFor({{S64, S32}, {S32, S16}}) 443 .lowerFor({{S64, S16}}) // FIXME: Implement 444 .scalarize(0); 445 446 getActionDefinitionsBuilder(G_FSUB) 447 // Use actual fsub instruction 448 .legalFor({S32}) 449 // Must use fadd + fneg 450 .lowerFor({S64, S16, V2S16}) 451 .scalarize(0) 452 .clampScalar(0, S32, S64); 453 454 // Whether this is legal depends on the floating point mode for the function. 455 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 456 if (ST.hasMadF16()) 457 FMad.customFor({S32, S16}); 458 else 459 FMad.customFor({S32}); 460 FMad.scalarize(0) 461 .lower(); 462 463 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 464 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 465 {S32, S1}, {S64, S1}, {S16, S1}}) 466 .scalarize(0) 467 .clampScalar(0, S32, S64); 468 469 // TODO: Split s1->s64 during regbankselect for VALU. 470 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 471 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 472 .lowerFor({{S32, S64}}) 473 .lowerIf(typeIs(1, S1)) 474 .customFor({{S64, S64}}); 475 if (ST.has16BitInsts()) 476 IToFP.legalFor({{S16, S16}}); 477 IToFP.clampScalar(1, S32, S64) 478 .scalarize(0); 479 480 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 481 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 482 .customFor({{S64, S64}}); 483 if (ST.has16BitInsts()) 484 FPToI.legalFor({{S16, S16}}); 485 else 486 FPToI.minScalar(1, S32); 487 488 FPToI.minScalar(0, S32) 489 .scalarize(0) 490 .lower(); 491 492 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 493 .scalarize(0) 494 .lower(); 495 496 if (ST.has16BitInsts()) { 497 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 498 .legalFor({S16, S32, S64}) 499 .clampScalar(0, S16, S64) 500 .scalarize(0); 501 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 502 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 503 .legalFor({S32, S64}) 504 .clampScalar(0, S32, S64) 505 .scalarize(0); 506 } else { 507 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 508 .legalFor({S32}) 509 .customFor({S64}) 510 .clampScalar(0, S32, S64) 511 .scalarize(0); 512 } 513 514 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 515 .scalarize(0) 516 .alwaysLegal(); 517 518 auto &CmpBuilder = 519 getActionDefinitionsBuilder(G_ICMP) 520 // The compare output type differs based on the register bank of the output, 521 // so make both s1 and s32 legal. 
522 // 523 // Scalar compares producing output in scc will be promoted to s32, as that 524 // is the allocatable register type that will be needed for the copy from 525 // scc. This will be promoted during RegBankSelect, and we assume something 526 // before that won't try to use s32 result types. 527 // 528 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 529 // bank. 530 .legalForCartesianProduct( 531 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 532 .legalForCartesianProduct( 533 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 534 if (ST.has16BitInsts()) { 535 CmpBuilder.legalFor({{S1, S16}}); 536 } 537 538 CmpBuilder 539 .widenScalarToNextPow2(1) 540 .clampScalar(1, S32, S64) 541 .scalarize(0) 542 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 543 544 getActionDefinitionsBuilder(G_FCMP) 545 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 546 .widenScalarToNextPow2(1) 547 .clampScalar(1, S32, S64) 548 .scalarize(0); 549 550 // FIXME: fpow has a selection pattern that should move to custom lowering. 551 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW}); 552 if (ST.has16BitInsts()) 553 Exp2Ops.legalFor({S32, S16}); 554 else 555 Exp2Ops.legalFor({S32}); 556 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 557 Exp2Ops.scalarize(0); 558 559 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10}); 560 if (ST.has16BitInsts()) 561 ExpOps.customFor({{S32}, {S16}}); 562 else 563 ExpOps.customFor({S32}); 564 ExpOps.clampScalar(0, MinScalarFPTy, S32) 565 .scalarize(0); 566 567 // The 64-bit versions produce 32-bit results, but only on the SALU. 568 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, 569 G_CTTZ, G_CTTZ_ZERO_UNDEF, 570 G_CTPOP}) 571 .legalFor({{S32, S32}, {S32, S64}}) 572 .clampScalar(0, S32, S32) 573 .clampScalar(1, S32, S64) 574 .scalarize(0) 575 .widenScalarToNextPow2(0, 32) 576 .widenScalarToNextPow2(1, 32); 577 578 // TODO: Expand for > s32 579 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) 580 .legalFor({S32}) 581 .clampScalar(0, S32, S32) 582 .scalarize(0); 583 584 if (ST.has16BitInsts()) { 585 if (ST.hasVOP3PInsts()) { 586 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 587 .legalFor({S32, S16, V2S16}) 588 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 589 .clampMaxNumElements(0, S16, 2) 590 .clampScalar(0, S16, S32) 591 .widenScalarToNextPow2(0) 592 .scalarize(0); 593 } else { 594 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 595 .legalFor({S32, S16}) 596 .widenScalarToNextPow2(0) 597 .clampScalar(0, S16, S32) 598 .scalarize(0); 599 } 600 } else { 601 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 602 .legalFor({S32}) 603 .clampScalar(0, S32, S32) 604 .widenScalarToNextPow2(0) 605 .scalarize(0); 606 } 607 608 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 609 return [=](const LegalityQuery &Query) { 610 return Query.Types[TypeIdx0].getSizeInBits() < 611 Query.Types[TypeIdx1].getSizeInBits(); 612 }; 613 }; 614 615 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 616 return [=](const LegalityQuery &Query) { 617 return Query.Types[TypeIdx0].getSizeInBits() > 618 Query.Types[TypeIdx1].getSizeInBits(); 619 }; 620 }; 621 622 getActionDefinitionsBuilder(G_INTTOPTR) 623 // List the common cases 624 .legalForCartesianProduct(AddrSpaces64, {S64}) 625 .legalForCartesianProduct(AddrSpaces32, {S32}) 626 .scalarize(0) 627 // Accept any address space as 
long as the size matches 628 .legalIf(sameSize(0, 1)) 629 .widenScalarIf(smallerThan(1, 0), 630 [](const LegalityQuery &Query) { 631 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 632 }) 633 .narrowScalarIf(greaterThan(1, 0), 634 [](const LegalityQuery &Query) { 635 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 636 }); 637 638 getActionDefinitionsBuilder(G_PTRTOINT) 639 // List the common cases 640 .legalForCartesianProduct(AddrSpaces64, {S64}) 641 .legalForCartesianProduct(AddrSpaces32, {S32}) 642 .scalarize(0) 643 // Accept any address space as long as the size matches 644 .legalIf(sameSize(0, 1)) 645 .widenScalarIf(smallerThan(0, 1), 646 [](const LegalityQuery &Query) { 647 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 648 }) 649 .narrowScalarIf( 650 greaterThan(0, 1), 651 [](const LegalityQuery &Query) { 652 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 653 }); 654 655 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 656 .scalarize(0) 657 .custom(); 658 659 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 660 // handle some operations by just promoting the register during 661 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 662 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 663 switch (AS) { 664 // FIXME: Private element size. 665 case AMDGPUAS::PRIVATE_ADDRESS: 666 return 32; 667 // FIXME: Check subtarget 668 case AMDGPUAS::LOCAL_ADDRESS: 669 return ST.useDS128() ? 128 : 64; 670 671 // Treat constant and global as identical. SMRD loads are sometimes usable 672 // for global loads (ideally constant address space should be eliminated) 673 // depending on the context. Legality cannot be context dependent, but 674 // RegBankSelect can split the load as necessary depending on the pointer 675 // register bank/uniformity and if the memory is invariant or not written in 676 // a kernel. 677 case AMDGPUAS::CONSTANT_ADDRESS: 678 case AMDGPUAS::GLOBAL_ADDRESS: 679 return IsLoad ? 512 : 128; 680 default: 681 return 128; 682 } 683 }; 684 685 const auto needToSplitMemOp = [=](const LegalityQuery &Query, bool IsLoad) -> bool { 686 const LLT DstTy = Query.Types[0]; 687 688 // Split vector extloads. 689 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 690 unsigned Align = Query.MMODescrs[0].AlignInBits; 691 692 if (MemSize < DstTy.getSizeInBits()) 693 MemSize = std::max(MemSize, Align); 694 695 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 696 return true; 697 698 const LLT PtrTy = Query.Types[1]; 699 unsigned AS = PtrTy.getAddressSpace(); 700 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 701 return true; 702 703 // Catch weird sized loads that don't evenly divide into the access sizes 704 // TODO: May be able to widen depending on alignment etc. 705 unsigned NumRegs = MemSize / 32; 706 if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) 707 return true; 708 709 if (Align < MemSize) { 710 const SITargetLowering *TLI = ST.getTargetLowering(); 711 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 712 } 713 714 return false; 715 }; 716 717 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 718 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 719 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 720 721 // TODO: Refine based on subtargets which support unaligned access or 128-bit 722 // LDS 723 // TODO: Unsupported flat for SI. 
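// For example, per needToSplitMemOp above, a 256-bit global store must be
// split because maxSizeForAddrSpace allows only 128 bits for stores, while a
// sufficiently aligned 256-bit global load stays whole (the load limit is
// 512 bits).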
724 725 for (unsigned Op : {G_LOAD, G_STORE}) { 726 const bool IsStore = Op == G_STORE; 727 728 auto &Actions = getActionDefinitionsBuilder(Op); 729 // Whitelist the common cases. 730 // TODO: Pointer loads 731 // TODO: Wide constant loads 732 // TODO: Only CI+ has 3x loads 733 // TODO: Loads to s16 on gfx9 734 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 735 {V2S32, GlobalPtr, 64, GlobalAlign32}, 736 {V3S32, GlobalPtr, 96, GlobalAlign32}, 737 {S96, GlobalPtr, 96, GlobalAlign32}, 738 {V4S32, GlobalPtr, 128, GlobalAlign32}, 739 {S128, GlobalPtr, 128, GlobalAlign32}, 740 {S64, GlobalPtr, 64, GlobalAlign32}, 741 {V2S64, GlobalPtr, 128, GlobalAlign32}, 742 {V2S16, GlobalPtr, 32, GlobalAlign32}, 743 {S32, GlobalPtr, 8, GlobalAlign8}, 744 {S32, GlobalPtr, 16, GlobalAlign16}, 745 746 {S32, LocalPtr, 32, 32}, 747 {S64, LocalPtr, 64, 32}, 748 {V2S32, LocalPtr, 64, 32}, 749 {S32, LocalPtr, 8, 8}, 750 {S32, LocalPtr, 16, 16}, 751 {V2S16, LocalPtr, 32, 32}, 752 753 {S32, PrivatePtr, 32, 32}, 754 {S32, PrivatePtr, 8, 8}, 755 {S32, PrivatePtr, 16, 16}, 756 {V2S16, PrivatePtr, 32, 32}, 757 758 {S32, FlatPtr, 32, GlobalAlign32}, 759 {S32, FlatPtr, 16, GlobalAlign16}, 760 {S32, FlatPtr, 8, GlobalAlign8}, 761 {V2S16, FlatPtr, 32, GlobalAlign32}, 762 763 {S32, ConstantPtr, 32, GlobalAlign32}, 764 {V2S32, ConstantPtr, 64, GlobalAlign32}, 765 {V3S32, ConstantPtr, 96, GlobalAlign32}, 766 {V4S32, ConstantPtr, 128, GlobalAlign32}, 767 {S64, ConstantPtr, 64, GlobalAlign32}, 768 {S128, ConstantPtr, 128, GlobalAlign32}, 769 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 770 Actions 771 .customIf(typeIs(1, Constant32Ptr)) 772 .narrowScalarIf( 773 [=](const LegalityQuery &Query) -> bool { 774 return !Query.Types[0].isVector() && 775 needToSplitMemOp(Query, Op == G_LOAD); 776 }, 777 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 778 const LLT DstTy = Query.Types[0]; 779 const LLT PtrTy = Query.Types[1]; 780 781 const unsigned DstSize = DstTy.getSizeInBits(); 782 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 783 784 // Split extloads. 785 if (DstSize > MemSize) 786 return std::make_pair(0, LLT::scalar(MemSize)); 787 788 if (DstSize > 32 && (DstSize % 32 != 0)) { 789 // FIXME: Need a way to specify non-extload of larger size if 790 // suitably aligned. 791 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 792 } 793 794 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 795 Op == G_LOAD); 796 if (MemSize > MaxSize) 797 return std::make_pair(0, LLT::scalar(MaxSize)); 798 799 unsigned Align = Query.MMODescrs[0].AlignInBits; 800 return std::make_pair(0, LLT::scalar(Align)); 801 }) 802 .fewerElementsIf( 803 [=](const LegalityQuery &Query) -> bool { 804 return Query.Types[0].isVector() && 805 needToSplitMemOp(Query, Op == G_LOAD); 806 }, 807 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 808 const LLT DstTy = Query.Types[0]; 809 const LLT PtrTy = Query.Types[1]; 810 811 LLT EltTy = DstTy.getElementType(); 812 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 813 Op == G_LOAD); 814 815 // Split if it's too large for the address space. 
816 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 817 unsigned NumElts = DstTy.getNumElements(); 818 unsigned EltSize = EltTy.getSizeInBits(); 819 820 if (MaxSize % EltSize == 0) { 821 return std::make_pair( 822 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 823 } 824 825 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 826 827 // FIXME: Refine when odd breakdowns handled 828 // The scalars will need to be re-legalized. 829 if (NumPieces == 1 || NumPieces >= NumElts || 830 NumElts % NumPieces != 0) 831 return std::make_pair(0, EltTy); 832 833 return std::make_pair(0, 834 LLT::vector(NumElts / NumPieces, EltTy)); 835 } 836 837 // Need to split because of alignment. 838 unsigned Align = Query.MMODescrs[0].AlignInBits; 839 unsigned EltSize = EltTy.getSizeInBits(); 840 if (EltSize > Align && 841 (EltSize / Align < DstTy.getNumElements())) { 842 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 843 } 844 845 // May need relegalization for the scalars. 846 return std::make_pair(0, EltTy); 847 }) 848 .minScalar(0, S32); 849 850 if (IsStore) 851 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 852 853 // TODO: Need a bitcast lower option? 854 Actions 855 .legalIf([=](const LegalityQuery &Query) { 856 const LLT Ty0 = Query.Types[0]; 857 unsigned Size = Ty0.getSizeInBits(); 858 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 859 unsigned Align = Query.MMODescrs[0].AlignInBits; 860 861 // FIXME: Widening store from alignment not valid. 862 if (MemSize < Size) 863 MemSize = std::max(MemSize, Align); 864 865 // No extending vector loads. 866 if (Size > MemSize && Ty0.isVector()) 867 return false; 868 869 switch (MemSize) { 870 case 8: 871 case 16: 872 return Size == 32; 873 case 32: 874 case 64: 875 case 128: 876 return true; 877 case 96: 878 return ST.hasDwordx3LoadStores(); 879 case 256: 880 case 512: 881 return true; 882 default: 883 return false; 884 } 885 }) 886 .widenScalarToNextPow2(0) 887 // TODO: v3s32->v4s32 with alignment 888 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 889 } 890 891 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 892 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 893 {S32, GlobalPtr, 16, 2 * 8}, 894 {S32, LocalPtr, 8, 8}, 895 {S32, LocalPtr, 16, 16}, 896 {S32, PrivatePtr, 8, 8}, 897 {S32, PrivatePtr, 16, 16}, 898 {S32, ConstantPtr, 8, 8}, 899 {S32, ConstantPtr, 16, 2 * 8}}); 900 if (ST.hasFlatAddressSpace()) { 901 ExtLoads.legalForTypesWithMemDesc( 902 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 903 } 904 905 ExtLoads.clampScalar(0, S32, S32) 906 .widenScalarToNextPow2(0) 907 .unsupportedIfMemSizeNotPow2() 908 .lower(); 909 910 auto &Atomics = getActionDefinitionsBuilder( 911 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 912 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 913 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 914 G_ATOMICRMW_UMIN}) 915 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 916 {S64, GlobalPtr}, {S64, LocalPtr}}); 917 if (ST.hasFlatAddressSpace()) { 918 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 919 } 920 921 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 922 .legalFor({{S32, LocalPtr}}); 923 924 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 925 // demarshalling 926 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 927 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 928 {S32, FlatPtr}, {S64, FlatPtr}}) 929 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 930 {S32, RegionPtr}, {S64, RegionPtr}}); 931 // 
TODO: Pointer types, any 32-bit or 64-bit vector 932 933 // Condition should be s32 for scalar, s1 for vector. 934 getActionDefinitionsBuilder(G_SELECT) 935 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 936 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 937 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 938 .clampScalar(0, S16, S64) 939 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 940 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 941 .scalarize(1) 942 .clampMaxNumElements(0, S32, 2) 943 .clampMaxNumElements(0, LocalPtr, 2) 944 .clampMaxNumElements(0, PrivatePtr, 2) 945 .scalarize(0) 946 .widenScalarToNextPow2(0) 947 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 948 949 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 950 // be more flexible with the shift amount type. 951 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 952 .legalFor({{S32, S32}, {S64, S32}}); 953 if (ST.has16BitInsts()) { 954 if (ST.hasVOP3PInsts()) { 955 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 956 .clampMaxNumElements(0, S16, 2); 957 } else 958 Shifts.legalFor({{S16, S32}, {S16, S16}}); 959 960 // TODO: Support 16-bit shift amounts 961 Shifts.clampScalar(1, S32, S32); 962 Shifts.clampScalar(0, S16, S64); 963 Shifts.widenScalarToNextPow2(0, 16); 964 } else { 965 // Make sure we legalize the shift amount type first, as the general 966 // expansion for the shifted type will produce much worse code if it hasn't 967 // been truncated already. 968 Shifts.clampScalar(1, S32, S32); 969 Shifts.clampScalar(0, S32, S64); 970 Shifts.widenScalarToNextPow2(0, 32); 971 } 972 Shifts.scalarize(0); 973 974 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 975 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 976 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 977 unsigned IdxTypeIdx = 2; 978 979 getActionDefinitionsBuilder(Op) 980 .customIf([=](const LegalityQuery &Query) { 981 const LLT EltTy = Query.Types[EltTypeIdx]; 982 const LLT VecTy = Query.Types[VecTypeIdx]; 983 const LLT IdxTy = Query.Types[IdxTypeIdx]; 984 return (EltTy.getSizeInBits() == 16 || 985 EltTy.getSizeInBits() % 32 == 0) && 986 VecTy.getSizeInBits() % 32 == 0 && 987 VecTy.getSizeInBits() <= 1024 && 988 IdxTy.getSizeInBits() == 32; 989 }) 990 .clampScalar(EltTypeIdx, S32, S64) 991 .clampScalar(VecTypeIdx, S32, S64) 992 .clampScalar(IdxTypeIdx, S32, S32); 993 } 994 995 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 996 .unsupportedIf([=](const LegalityQuery &Query) { 997 const LLT &EltTy = Query.Types[1].getElementType(); 998 return Query.Types[0] != EltTy; 999 }); 1000 1001 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1002 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1003 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1004 1005 // FIXME: Doesn't handle extract of illegal sizes. 1006 getActionDefinitionsBuilder(Op) 1007 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1008 // FIXME: Multiples of 16 should not be legal. 
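      // (E.g. the rule below accepts extracting an s48 from an s96, since 96
      // is a multiple of 32 and 48 is a multiple of 16; that is the kind of
      // case the FIXME above refers to.)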
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .lowerFor({{S16, V2S16}})
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1178 SextInReg.lowerFor({{S32}, {S64}}); 1179 } 1180 1181 SextInReg 1182 .scalarize(0) 1183 .clampScalar(0, S32, S64) 1184 .lower(); 1185 1186 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1187 .legalFor({S64}); 1188 1189 getActionDefinitionsBuilder({ 1190 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1191 G_FCOPYSIGN, 1192 1193 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1194 G_READ_REGISTER, 1195 G_WRITE_REGISTER, 1196 1197 G_SADDO, G_SSUBO, 1198 1199 // TODO: Implement 1200 G_FMINIMUM, G_FMAXIMUM 1201 }).lower(); 1202 1203 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1204 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1205 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1206 .unsupported(); 1207 1208 computeTables(); 1209 verify(*ST.getInstrInfo()); 1210 } 1211 1212 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1213 MachineRegisterInfo &MRI, 1214 MachineIRBuilder &B, 1215 GISelChangeObserver &Observer) const { 1216 switch (MI.getOpcode()) { 1217 case TargetOpcode::G_ADDRSPACE_CAST: 1218 return legalizeAddrSpaceCast(MI, MRI, B); 1219 case TargetOpcode::G_FRINT: 1220 return legalizeFrint(MI, MRI, B); 1221 case TargetOpcode::G_FCEIL: 1222 return legalizeFceil(MI, MRI, B); 1223 case TargetOpcode::G_INTRINSIC_TRUNC: 1224 return legalizeIntrinsicTrunc(MI, MRI, B); 1225 case TargetOpcode::G_SITOFP: 1226 return legalizeITOFP(MI, MRI, B, true); 1227 case TargetOpcode::G_UITOFP: 1228 return legalizeITOFP(MI, MRI, B, false); 1229 case TargetOpcode::G_FPTOSI: 1230 return legalizeFPTOI(MI, MRI, B, true); 1231 case TargetOpcode::G_FPTOUI: 1232 return legalizeFPTOI(MI, MRI, B, false); 1233 case TargetOpcode::G_FMINNUM: 1234 case TargetOpcode::G_FMAXNUM: 1235 case TargetOpcode::G_FMINNUM_IEEE: 1236 case TargetOpcode::G_FMAXNUM_IEEE: 1237 return legalizeMinNumMaxNum(MI, MRI, B); 1238 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1239 return legalizeExtractVectorElt(MI, MRI, B); 1240 case TargetOpcode::G_INSERT_VECTOR_ELT: 1241 return legalizeInsertVectorElt(MI, MRI, B); 1242 case TargetOpcode::G_SHUFFLE_VECTOR: 1243 return legalizeShuffleVector(MI, MRI, B); 1244 case TargetOpcode::G_FSIN: 1245 case TargetOpcode::G_FCOS: 1246 return legalizeSinCos(MI, MRI, B); 1247 case TargetOpcode::G_GLOBAL_VALUE: 1248 return legalizeGlobalValue(MI, MRI, B); 1249 case TargetOpcode::G_LOAD: 1250 return legalizeLoad(MI, MRI, B, Observer); 1251 case TargetOpcode::G_FMAD: 1252 return legalizeFMad(MI, MRI, B); 1253 case TargetOpcode::G_FDIV: 1254 return legalizeFDIV(MI, MRI, B); 1255 case TargetOpcode::G_ATOMIC_CMPXCHG: 1256 return legalizeAtomicCmpXChg(MI, MRI, B); 1257 case TargetOpcode::G_FLOG: 1258 return legalizeFlog(MI, B, 1.0f / numbers::log2ef); 1259 case TargetOpcode::G_FLOG10: 1260 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1261 case TargetOpcode::G_FEXP: 1262 return legalizeFExp(MI, B); 1263 case TargetOpcode::G_FFLOOR: 1264 return legalizeFFloor(MI, MRI, B); 1265 case TargetOpcode::G_BUILD_VECTOR: 1266 return legalizeBuildVector(MI, MRI, B); 1267 default: 1268 return false; 1269 } 1270 1271 llvm_unreachable("expected switch to return"); 1272 } 1273 1274 Register AMDGPULegalizerInfo::getSegmentAperture( 1275 unsigned AS, 1276 MachineRegisterInfo &MRI, 1277 MachineIRBuilder &B) const { 1278 MachineFunction &MF = B.getMF(); 1279 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1280 const LLT S32 = LLT::scalar(32); 1281 1282 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1283 1284 if (ST.hasApertureRegs()) { 1285 // FIXME: Use 
inline constants (src_{shared, private}_base) instead of 1286 // getreg. 1287 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1288 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1289 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1290 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1291 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1292 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1293 unsigned Encoding = 1294 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1295 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1296 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1297 1298 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1299 1300 B.buildInstr(AMDGPU::S_GETREG_B32) 1301 .addDef(GetReg) 1302 .addImm(Encoding); 1303 MRI.setType(GetReg, S32); 1304 1305 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1306 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1307 } 1308 1309 Register QueuePtr = MRI.createGenericVirtualRegister( 1310 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1311 1312 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1313 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1314 return Register(); 1315 1316 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1317 // private_segment_aperture_base_hi. 1318 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1319 1320 // TODO: can we be smarter about machine pointer info? 1321 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1322 MachineMemOperand *MMO = MF.getMachineMemOperand( 1323 PtrInfo, 1324 MachineMemOperand::MOLoad | 1325 MachineMemOperand::MODereferenceable | 1326 MachineMemOperand::MOInvariant, 1327 4, 1328 MinAlign(64, StructOffset)); 1329 1330 Register LoadAddr; 1331 1332 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1333 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1334 } 1335 1336 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1337 MachineInstr &MI, MachineRegisterInfo &MRI, 1338 MachineIRBuilder &B) const { 1339 MachineFunction &MF = B.getMF(); 1340 1341 B.setInstr(MI); 1342 1343 const LLT S32 = LLT::scalar(32); 1344 Register Dst = MI.getOperand(0).getReg(); 1345 Register Src = MI.getOperand(1).getReg(); 1346 1347 LLT DstTy = MRI.getType(Dst); 1348 LLT SrcTy = MRI.getType(Src); 1349 unsigned DestAS = DstTy.getAddressSpace(); 1350 unsigned SrcAS = SrcTy.getAddressSpace(); 1351 1352 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1353 // vector element. 1354 assert(!DstTy.isVector()); 1355 1356 const AMDGPUTargetMachine &TM 1357 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1358 1359 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1360 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1361 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1362 return true; 1363 } 1364 1365 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1366 // Truncate. 1367 B.buildExtract(Dst, Src, 0); 1368 MI.eraseFromParent(); 1369 return true; 1370 } 1371 1372 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1373 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1374 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1375 1376 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1377 // another. Merge operands are required to be the same type, but creating an 1378 // extra ptrtoint would be kind of pointless. 
1379 auto HighAddr = B.buildConstant( 1380 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1381 B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); 1382 MI.eraseFromParent(); 1383 return true; 1384 } 1385 1386 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1387 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1388 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1389 unsigned NullVal = TM.getNullPointerValue(DestAS); 1390 1391 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1392 auto FlatNull = B.buildConstant(SrcTy, 0); 1393 1394 // Extract low 32-bits of the pointer. 1395 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1396 1397 auto CmpRes = 1398 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1399 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1400 1401 MI.eraseFromParent(); 1402 return true; 1403 } 1404 1405 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1406 return false; 1407 1408 if (!ST.hasFlatAddressSpace()) 1409 return false; 1410 1411 auto SegmentNull = 1412 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1413 auto FlatNull = 1414 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1415 1416 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1417 if (!ApertureReg.isValid()) 1418 return false; 1419 1420 auto CmpRes = 1421 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1422 1423 // Coerce the type of the low half of the result so we can use merge_values. 1424 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1425 1426 // TODO: Should we allow mismatched types but matching sizes in merges to 1427 // avoid the ptrtoint? 1428 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1429 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1430 1431 MI.eraseFromParent(); 1432 return true; 1433 } 1434 1435 bool AMDGPULegalizerInfo::legalizeFrint( 1436 MachineInstr &MI, MachineRegisterInfo &MRI, 1437 MachineIRBuilder &B) const { 1438 B.setInstr(MI); 1439 1440 Register Src = MI.getOperand(1).getReg(); 1441 LLT Ty = MRI.getType(Src); 1442 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1443 1444 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1445 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1446 1447 auto C1 = B.buildFConstant(Ty, C1Val); 1448 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1449 1450 // TODO: Should this propagate fast-math-flags? 
1451 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1452 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1453 1454 auto C2 = B.buildFConstant(Ty, C2Val); 1455 auto Fabs = B.buildFAbs(Ty, Src); 1456 1457 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1458 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1459 return true; 1460 } 1461 1462 bool AMDGPULegalizerInfo::legalizeFceil( 1463 MachineInstr &MI, MachineRegisterInfo &MRI, 1464 MachineIRBuilder &B) const { 1465 B.setInstr(MI); 1466 1467 const LLT S1 = LLT::scalar(1); 1468 const LLT S64 = LLT::scalar(64); 1469 1470 Register Src = MI.getOperand(1).getReg(); 1471 assert(MRI.getType(Src) == S64); 1472 1473 // result = trunc(src) 1474 // if (src > 0.0 && src != result) 1475 // result += 1.0 1476 1477 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1478 1479 const auto Zero = B.buildFConstant(S64, 0.0); 1480 const auto One = B.buildFConstant(S64, 1.0); 1481 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1482 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1483 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1484 auto Add = B.buildSelect(S64, And, One, Zero); 1485 1486 // TODO: Should this propagate fast-math-flags? 1487 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1488 return true; 1489 } 1490 1491 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1492 MachineIRBuilder &B) { 1493 const unsigned FractBits = 52; 1494 const unsigned ExpBits = 11; 1495 LLT S32 = LLT::scalar(32); 1496 1497 auto Const0 = B.buildConstant(S32, FractBits - 32); 1498 auto Const1 = B.buildConstant(S32, ExpBits); 1499 1500 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1501 .addUse(Const0.getReg(0)) 1502 .addUse(Const1.getReg(0)); 1503 1504 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1505 } 1506 1507 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1508 MachineInstr &MI, MachineRegisterInfo &MRI, 1509 MachineIRBuilder &B) const { 1510 B.setInstr(MI); 1511 1512 const LLT S1 = LLT::scalar(1); 1513 const LLT S32 = LLT::scalar(32); 1514 const LLT S64 = LLT::scalar(64); 1515 1516 Register Src = MI.getOperand(1).getReg(); 1517 assert(MRI.getType(Src) == S64); 1518 1519 // TODO: Should this use extract since the low half is unused? 1520 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1521 Register Hi = Unmerge.getReg(1); 1522 1523 // Extract the upper half, since this is where we will find the sign and 1524 // exponent. 1525 auto Exp = extractF64Exponent(Hi, B); 1526 1527 const unsigned FractBits = 52; 1528 1529 // Extract the sign bit. 1530 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1531 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1532 1533 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1534 1535 const auto Zero32 = B.buildConstant(S32, 0); 1536 1537 // Extend back to 64-bits. 
1538 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); 1539 1540 auto Shr = B.buildAShr(S64, FractMask, Exp); 1541 auto Not = B.buildNot(S64, Shr); 1542 auto Tmp0 = B.buildAnd(S64, Src, Not); 1543 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1544 1545 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1546 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1547 1548 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1549 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1550 return true; 1551 } 1552 1553 bool AMDGPULegalizerInfo::legalizeITOFP( 1554 MachineInstr &MI, MachineRegisterInfo &MRI, 1555 MachineIRBuilder &B, bool Signed) const { 1556 B.setInstr(MI); 1557 1558 Register Dst = MI.getOperand(0).getReg(); 1559 Register Src = MI.getOperand(1).getReg(); 1560 1561 const LLT S64 = LLT::scalar(64); 1562 const LLT S32 = LLT::scalar(32); 1563 1564 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1565 1566 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1567 1568 auto CvtHi = Signed ? 1569 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1570 B.buildUITOFP(S64, Unmerge.getReg(1)); 1571 1572 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1573 1574 auto ThirtyTwo = B.buildConstant(S32, 32); 1575 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1576 .addUse(CvtHi.getReg(0)) 1577 .addUse(ThirtyTwo.getReg(0)); 1578 1579 // TODO: Should this propagate fast-math-flags? 1580 B.buildFAdd(Dst, LdExp, CvtLo); 1581 MI.eraseFromParent(); 1582 return true; 1583 } 1584 1585 // TODO: Copied from DAG implementation. Verify logic and document how this 1586 // actually works. 1587 bool AMDGPULegalizerInfo::legalizeFPTOI( 1588 MachineInstr &MI, MachineRegisterInfo &MRI, 1589 MachineIRBuilder &B, bool Signed) const { 1590 B.setInstr(MI); 1591 1592 Register Dst = MI.getOperand(0).getReg(); 1593 Register Src = MI.getOperand(1).getReg(); 1594 1595 const LLT S64 = LLT::scalar(64); 1596 const LLT S32 = LLT::scalar(32); 1597 1598 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1599 1600 unsigned Flags = MI.getFlags(); 1601 1602 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1603 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1604 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1605 1606 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1607 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1608 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1609 1610 auto Hi = Signed ? 
1611 B.buildFPTOSI(S32, FloorMul) : 1612 B.buildFPTOUI(S32, FloorMul); 1613 auto Lo = B.buildFPTOUI(S32, Fma); 1614 1615 B.buildMerge(Dst, { Lo.getReg(0), Hi.getReg(0) }); 1616 MI.eraseFromParent(); 1617 1618 return true; 1619 } 1620 1621 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1622 MachineInstr &MI, MachineRegisterInfo &MRI, 1623 MachineIRBuilder &B) const { 1624 MachineFunction &MF = B.getMF(); 1625 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1626 1627 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1628 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1629 1630 // With ieee_mode disabled, the instructions have the correct behavior 1631 // already for G_FMINNUM/G_FMAXNUM 1632 if (!MFI->getMode().IEEE) 1633 return !IsIEEEOp; 1634 1635 if (IsIEEEOp) 1636 return true; 1637 1638 MachineIRBuilder HelperBuilder(MI); 1639 GISelObserverWrapper DummyObserver; 1640 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1641 HelperBuilder.setInstr(MI); 1642 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1643 } 1644 1645 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1646 MachineInstr &MI, MachineRegisterInfo &MRI, 1647 MachineIRBuilder &B) const { 1648 // TODO: Should move some of this into LegalizerHelper. 1649 1650 // TODO: Promote dynamic indexing of s16 to s32 1651 // TODO: Dynamic s64 indexing is only legal for SGPR. 1652 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); 1653 if (!IdxVal) // Dynamic case will be selected to register indexing. 1654 return true; 1655 1656 Register Dst = MI.getOperand(0).getReg(); 1657 Register Vec = MI.getOperand(1).getReg(); 1658 1659 LLT VecTy = MRI.getType(Vec); 1660 LLT EltTy = VecTy.getElementType(); 1661 assert(EltTy == MRI.getType(Dst)); 1662 1663 B.setInstr(MI); 1664 1665 if (IdxVal.getValue() < VecTy.getNumElements()) 1666 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); 1667 else 1668 B.buildUndef(Dst); 1669 1670 MI.eraseFromParent(); 1671 return true; 1672 } 1673 1674 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1675 MachineInstr &MI, MachineRegisterInfo &MRI, 1676 MachineIRBuilder &B) const { 1677 // TODO: Should move some of this into LegalizerHelper. 1678 1679 // TODO: Promote dynamic indexing of s16 to s32 1680 // TODO: Dynamic s64 indexing is only legal for SGPR. 1681 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); 1682 if (!IdxVal) // Dynamic case will be selected to register indexing. 1683 return true; 1684 1685 Register Dst = MI.getOperand(0).getReg(); 1686 Register Vec = MI.getOperand(1).getReg(); 1687 Register Ins = MI.getOperand(2).getReg(); 1688 1689 LLT VecTy = MRI.getType(Vec); 1690 LLT EltTy = VecTy.getElementType(); 1691 assert(EltTy == MRI.getType(Ins)); 1692 1693 B.setInstr(MI); 1694 1695 if (IdxVal.getValue() < VecTy.getNumElements()) 1696 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); 1697 else 1698 B.buildUndef(Dst); 1699 1700 MI.eraseFromParent(); 1701 return true; 1702 } 1703 1704 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) { 1705 assert(Mask.size() == 2); 1706 1707 // If one half is undef, the other is trivially in the same reg. 
1708 if (Mask[0] == -1 || Mask[1] == -1) 1709 return true; 1710 return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) || 1711 ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3)); 1712 } 1713 1714 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1715 MachineInstr &MI, MachineRegisterInfo &MRI, 1716 MachineIRBuilder &B) const { 1717 const LLT V2S16 = LLT::vector(2, 16); 1718 1719 Register Dst = MI.getOperand(0).getReg(); 1720 Register Src0 = MI.getOperand(1).getReg(); 1721 LLT DstTy = MRI.getType(Dst); 1722 LLT SrcTy = MRI.getType(Src0); 1723 1724 if (SrcTy == V2S16 && DstTy == V2S16 && 1725 isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1726 return true; 1727 1728 MachineIRBuilder HelperBuilder(MI); 1729 GISelObserverWrapper DummyObserver; 1730 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1731 HelperBuilder.setInstr(MI); 1732 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1733 } 1734 1735 bool AMDGPULegalizerInfo::legalizeSinCos( 1736 MachineInstr &MI, MachineRegisterInfo &MRI, 1737 MachineIRBuilder &B) const { 1738 B.setInstr(MI); 1739 1740 Register DstReg = MI.getOperand(0).getReg(); 1741 Register SrcReg = MI.getOperand(1).getReg(); 1742 LLT Ty = MRI.getType(DstReg); 1743 unsigned Flags = MI.getFlags(); 1744 1745 Register TrigVal; 1746 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1747 if (ST.hasTrigReducedRange()) { 1748 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1749 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1750 .addUse(MulVal.getReg(0)) 1751 .setMIFlags(Flags).getReg(0); 1752 } else 1753 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1754 1755 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1756 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1757 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1758 .addUse(TrigVal) 1759 .setMIFlags(Flags); 1760 MI.eraseFromParent(); 1761 return true; 1762 } 1763 1764 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1765 Register DstReg, LLT PtrTy, 1766 MachineIRBuilder &B, const GlobalValue *GV, 1767 unsigned Offset, unsigned GAFlags) const { 1768 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1769 // to the following code sequence: 1770 // 1771 // For constant address space: 1772 // s_getpc_b64 s[0:1] 1773 // s_add_u32 s0, s0, $symbol 1774 // s_addc_u32 s1, s1, 0 1775 // 1776 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1777 // a fixup or relocation is emitted to replace $symbol with a literal 1778 // constant, which is a pc-relative offset from the encoding of the $symbol 1779 // operand to the global variable. 1780 // 1781 // For global address space: 1782 // s_getpc_b64 s[0:1] 1783 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1784 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1785 // 1786 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1787 // fixups or relocations are emitted to replace $symbol@*@lo and 1788 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1789 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1790 // operand to the global variable. 
1791 //
1792 // What we want here is an offset from the value returned by s_getpc
1793 // (which is the address of the s_add_u32 instruction) to the global
1794 // variable, but since the encoding of $symbol starts 4 bytes after the start
1795 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1796 // small. This requires us to add 4 to the global variable offset in order to
1797 // compute the correct address.
1798
1799 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1800
1801 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1802 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1803
1804 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1805 .addDef(PCReg);
1806
1807 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1808 if (GAFlags == SIInstrInfo::MO_NONE)
1809 MIB.addImm(0);
1810 else
1811 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1812
1813 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1814
1815 if (PtrTy.getSizeInBits() == 32)
1816 B.buildExtract(DstReg, PCReg, 0);
1817 return true;
1818 }
1819
1820 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1821 MachineInstr &MI, MachineRegisterInfo &MRI,
1822 MachineIRBuilder &B) const {
1823 Register DstReg = MI.getOperand(0).getReg();
1824 LLT Ty = MRI.getType(DstReg);
1825 unsigned AS = Ty.getAddressSpace();
1826
1827 const GlobalValue *GV = MI.getOperand(1).getGlobal();
1828 MachineFunction &MF = B.getMF();
1829 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1830 B.setInstr(MI);
1831
1832 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1833 if (!MFI->isEntryFunction()) {
1834 const Function &Fn = MF.getFunction();
1835 DiagnosticInfoUnsupported BadLDSDecl(
1836 Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1837 Fn.getContext().diagnose(BadLDSDecl);
1838 }
1839
1840 // TODO: We could emit code to handle the initialization somewhere.
1841 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1842 const SITargetLowering *TLI = ST.getTargetLowering();
1843 if (!TLI->shouldUseLDSConstAddress(GV)) {
1844 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1845 return true; // Leave in place.
1846 }
1847
1848 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1849 MI.eraseFromParent();
1850 return true;
1851 }
1852
1853 const Function &Fn = MF.getFunction();
1854 DiagnosticInfoUnsupported BadInit(
1855 Fn, "unsupported initializer for address space", MI.getDebugLoc());
1856 Fn.getContext().diagnose(BadInit);
1857 return true;
1858 }
1859
1860 const SITargetLowering *TLI = ST.getTargetLowering();
1861
1862 if (TLI->shouldEmitFixup(GV)) {
1863 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1864 MI.eraseFromParent();
1865 return true;
1866 }
1867
1868 if (TLI->shouldEmitPCReloc(GV)) {
1869 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1870 MI.eraseFromParent();
1871 return true;
1872 }
1873
1874 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1875 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1876
1877 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1878 MachinePointerInfo::getGOT(MF),
1879 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1880 MachineMemOperand::MOInvariant,
1881 8 /*Size*/, 8 /*Align*/);
1882
1883 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1884
1885 if (Ty.getSizeInBits() == 32) {
1886 // Truncate if this is a 32-bit constant address.
1887 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 1888 B.buildExtract(DstReg, Load, 0); 1889 } else 1890 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 1891 1892 MI.eraseFromParent(); 1893 return true; 1894 } 1895 1896 bool AMDGPULegalizerInfo::legalizeLoad( 1897 MachineInstr &MI, MachineRegisterInfo &MRI, 1898 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 1899 B.setInstr(MI); 1900 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1901 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 1902 Observer.changingInstr(MI); 1903 MI.getOperand(1).setReg(Cast.getReg(0)); 1904 Observer.changedInstr(MI); 1905 return true; 1906 } 1907 1908 bool AMDGPULegalizerInfo::legalizeFMad( 1909 MachineInstr &MI, MachineRegisterInfo &MRI, 1910 MachineIRBuilder &B) const { 1911 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 1912 assert(Ty.isScalar()); 1913 1914 MachineFunction &MF = B.getMF(); 1915 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1916 1917 // TODO: Always legal with future ftz flag. 1918 // FIXME: Do we need just output? 1919 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 1920 return true; 1921 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 1922 return true; 1923 1924 MachineIRBuilder HelperBuilder(MI); 1925 GISelObserverWrapper DummyObserver; 1926 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1927 HelperBuilder.setMBB(*MI.getParent()); 1928 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 1929 } 1930 1931 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 1932 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 1933 Register DstReg = MI.getOperand(0).getReg(); 1934 Register PtrReg = MI.getOperand(1).getReg(); 1935 Register CmpVal = MI.getOperand(2).getReg(); 1936 Register NewVal = MI.getOperand(3).getReg(); 1937 1938 assert(SITargetLowering::isFlatGlobalAddrSpace( 1939 MRI.getType(PtrReg).getAddressSpace()) && 1940 "this should not have been custom lowered"); 1941 1942 LLT ValTy = MRI.getType(CmpVal); 1943 LLT VecTy = LLT::vector(2, ValTy); 1944 1945 B.setInstr(MI); 1946 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 1947 1948 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 1949 .addDef(DstReg) 1950 .addUse(PtrReg) 1951 .addUse(PackedVal) 1952 .setMemRefs(MI.memoperands()); 1953 1954 MI.eraseFromParent(); 1955 return true; 1956 } 1957 1958 bool AMDGPULegalizerInfo::legalizeFlog( 1959 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 1960 Register Dst = MI.getOperand(0).getReg(); 1961 Register Src = MI.getOperand(1).getReg(); 1962 LLT Ty = B.getMRI()->getType(Dst); 1963 unsigned Flags = MI.getFlags(); 1964 B.setInstr(MI); 1965 1966 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 1967 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 1968 1969 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 1970 MI.eraseFromParent(); 1971 return true; 1972 } 1973 1974 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 1975 MachineIRBuilder &B) const { 1976 Register Dst = MI.getOperand(0).getReg(); 1977 Register Src = MI.getOperand(1).getReg(); 1978 unsigned Flags = MI.getFlags(); 1979 LLT Ty = B.getMRI()->getType(Dst); 1980 B.setInstr(MI); 1981 1982 auto K = B.buildFConstant(Ty, numbers::log2e); 1983 auto Mul = B.buildFMul(Ty, Src, K, Flags); 1984 B.buildFExp2(Dst, Mul, Flags); 1985 MI.eraseFromParent(); 1986 return true; 1987 } 1988 1989 // Find a source register, ignoring 
any possible source modifiers. 1990 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 1991 Register ModSrc = OrigSrc; 1992 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 1993 ModSrc = SrcFNeg->getOperand(1).getReg(); 1994 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 1995 ModSrc = SrcFAbs->getOperand(1).getReg(); 1996 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 1997 ModSrc = SrcFAbs->getOperand(1).getReg(); 1998 return ModSrc; 1999 } 2000 2001 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2002 MachineRegisterInfo &MRI, 2003 MachineIRBuilder &B) const { 2004 B.setInstr(MI); 2005 2006 const LLT S1 = LLT::scalar(1); 2007 const LLT S64 = LLT::scalar(64); 2008 Register Dst = MI.getOperand(0).getReg(); 2009 Register OrigSrc = MI.getOperand(1).getReg(); 2010 unsigned Flags = MI.getFlags(); 2011 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2012 "this should not have been custom lowered"); 2013 2014 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2015 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2016 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2017 // V_FRACT bug is: 2018 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2019 // 2020 // Convert floor(x) to (x - fract(x)) 2021 2022 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2023 .addUse(OrigSrc) 2024 .setMIFlags(Flags); 2025 2026 // Give source modifier matching some assistance before obscuring a foldable 2027 // pattern. 2028 2029 // TODO: We can avoid the neg on the fract? The input sign to fract 2030 // shouldn't matter? 2031 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2032 2033 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2034 2035 Register Min = MRI.createGenericVirtualRegister(S64); 2036 2037 // We don't need to concern ourselves with the snan handling difference, so 2038 // use the one which will directly select. 2039 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2040 if (MFI->getMode().IEEE) 2041 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2042 else 2043 B.buildFMinNum(Min, Fract, Const, Flags); 2044 2045 Register CorrectedFract = Min; 2046 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2047 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2048 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2049 } 2050 2051 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2052 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2053 2054 MI.eraseFromParent(); 2055 return true; 2056 } 2057 2058 // Turn an illegal packed v2s16 build vector into bit operations. 2059 // TODO: This should probably be a bitcast action in LegalizerHelper. 
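// A sketch of the expansion emitted below, written as informal MIR:
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// becomes
//   %m:_(s32)       = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)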
2060 bool AMDGPULegalizerInfo::legalizeBuildVector(
2061 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2062 Register Dst = MI.getOperand(0).getReg();
2063 LLT DstTy = MRI.getType(Dst);
2064 const LLT S32 = LLT::scalar(32);
2065 const LLT V2S16 = LLT::vector(2, 16);
2066 (void)DstTy;
2067 (void)V2S16;
2068 assert(DstTy == V2S16);
2069
2070 Register Src0 = MI.getOperand(1).getReg();
2071 Register Src1 = MI.getOperand(2).getReg();
2072 assert(MRI.getType(Src0) == LLT::scalar(16));
2073
2074 B.setInstr(MI);
2075 auto Merge = B.buildMerge(S32, {Src0, Src1});
2076 B.buildBitcast(Dst, Merge);
2077
2078 MI.eraseFromParent();
2079 return true;
2080 }
2081
2082 // Return the use branch instruction, or null if the usage is invalid.
2083 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2084 MachineRegisterInfo &MRI,
2085 MachineInstr *&Br) {
2086 Register CondDef = MI.getOperand(0).getReg();
2087 if (!MRI.hasOneNonDBGUse(CondDef))
2088 return nullptr;
2089
2090 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2091 if (UseMI.getParent() != MI.getParent() ||
2092 UseMI.getOpcode() != AMDGPU::G_BRCOND)
2093 return nullptr;
2094
2095 // Make sure the cond br is followed by a G_BR.
2096 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2097 if (Next != MI.getParent()->end()) {
2098 if (Next->getOpcode() != AMDGPU::G_BR)
2099 return nullptr;
2100 Br = &*Next;
2101 }
2102
2103 return &UseMI;
2104 }
2105
2106 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
2107 Register Reg, LLT Ty) const {
2108 Register LiveIn = MRI.getLiveInVirtReg(Reg);
2109 if (LiveIn)
2110 return LiveIn;
2111
2112 Register NewReg = MRI.createGenericVirtualRegister(Ty);
2113 MRI.addLiveIn(Reg, NewReg);
2114 return NewReg;
2115 }
2116
2117 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2118 const ArgDescriptor *Arg) const {
2119 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2120 return false; // TODO: Handle these
2121
2122 assert(Arg->getRegister().isPhysical());
2123
2124 MachineRegisterInfo &MRI = *B.getMRI();
2125
2126 LLT Ty = MRI.getType(DstReg);
2127 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2128
2129 if (Arg->isMasked()) {
2130 // TODO: Should we try to emit this once in the entry block?
2131 const LLT S32 = LLT::scalar(32);
2132 const unsigned Mask = Arg->getMask();
2133 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2134
2135 Register AndMaskSrc = LiveIn;
2136
2137 if (Shift != 0) {
2138 auto ShiftAmt = B.buildConstant(S32, Shift);
2139 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2140 }
2141
2142 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2143 } else
2144 B.buildCopy(DstReg, LiveIn);
2145
2146 // Insert the argument copy if it doesn't already exist.
2147 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2148 if (!MRI.getVRegDef(LiveIn)) { 2149 // FIXME: Should have scoped insert pt 2150 MachineBasicBlock &OrigInsBB = B.getMBB(); 2151 auto OrigInsPt = B.getInsertPt(); 2152 2153 MachineBasicBlock &EntryMBB = B.getMF().front(); 2154 EntryMBB.addLiveIn(Arg->getRegister()); 2155 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2156 B.buildCopy(LiveIn, Arg->getRegister()); 2157 2158 B.setInsertPt(OrigInsBB, OrigInsPt); 2159 } 2160 2161 return true; 2162 } 2163 2164 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2165 MachineInstr &MI, 2166 MachineRegisterInfo &MRI, 2167 MachineIRBuilder &B, 2168 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2169 B.setInstr(MI); 2170 2171 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2172 2173 const ArgDescriptor *Arg; 2174 const TargetRegisterClass *RC; 2175 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2176 if (!Arg) { 2177 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2178 return false; 2179 } 2180 2181 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 2182 MI.eraseFromParent(); 2183 return true; 2184 } 2185 2186 return false; 2187 } 2188 2189 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2190 MachineRegisterInfo &MRI, 2191 MachineIRBuilder &B) const { 2192 B.setInstr(MI); 2193 Register Dst = MI.getOperand(0).getReg(); 2194 LLT DstTy = MRI.getType(Dst); 2195 LLT S16 = LLT::scalar(16); 2196 LLT S32 = LLT::scalar(32); 2197 LLT S64 = LLT::scalar(64); 2198 2199 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2200 return true; 2201 2202 if (DstTy == S16) 2203 return legalizeFDIV16(MI, MRI, B); 2204 if (DstTy == S32) 2205 return legalizeFDIV32(MI, MRI, B); 2206 if (DstTy == S64) 2207 return legalizeFDIV64(MI, MRI, B); 2208 2209 return false; 2210 } 2211 2212 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2213 MachineRegisterInfo &MRI, 2214 MachineIRBuilder &B) const { 2215 Register Res = MI.getOperand(0).getReg(); 2216 Register LHS = MI.getOperand(1).getReg(); 2217 Register RHS = MI.getOperand(2).getReg(); 2218 2219 uint16_t Flags = MI.getFlags(); 2220 2221 LLT ResTy = MRI.getType(Res); 2222 LLT S32 = LLT::scalar(32); 2223 LLT S64 = LLT::scalar(64); 2224 2225 const MachineFunction &MF = B.getMF(); 2226 bool Unsafe = 2227 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2228 2229 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2230 return false; 2231 2232 if (!Unsafe && ResTy == S32 && 2233 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2234 return false; 2235 2236 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2237 // 1 / x -> RCP(x) 2238 if (CLHS->isExactlyValue(1.0)) { 2239 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2240 .addUse(RHS) 2241 .setMIFlags(Flags); 2242 2243 MI.eraseFromParent(); 2244 return true; 2245 } 2246 2247 // -1 / x -> RCP( FNEG(x) ) 2248 if (CLHS->isExactlyValue(-1.0)) { 2249 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2250 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2251 .addUse(FNeg.getReg(0)) 2252 .setMIFlags(Flags); 2253 2254 MI.eraseFromParent(); 2255 return true; 2256 } 2257 } 2258 2259 // x / y -> x * (1.0 / y) 2260 if (Unsafe) { 2261 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2262 .addUse(RHS) 2263 .setMIFlags(Flags); 2264 B.buildFMul(Res, LHS, RCP, Flags); 2265 2266 MI.eraseFromParent(); 2267 return true; 2268 } 2269 2270 return false; 2271 } 2272 2273 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2274 MachineRegisterInfo &MRI, 2275 
MachineIRBuilder &B) const { 2276 B.setInstr(MI); 2277 Register Res = MI.getOperand(0).getReg(); 2278 Register LHS = MI.getOperand(1).getReg(); 2279 Register RHS = MI.getOperand(2).getReg(); 2280 2281 uint16_t Flags = MI.getFlags(); 2282 2283 LLT S16 = LLT::scalar(16); 2284 LLT S32 = LLT::scalar(32); 2285 2286 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2287 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2288 2289 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2290 .addUse(RHSExt.getReg(0)) 2291 .setMIFlags(Flags); 2292 2293 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2294 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2295 2296 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2297 .addUse(RDst.getReg(0)) 2298 .addUse(RHS) 2299 .addUse(LHS) 2300 .setMIFlags(Flags); 2301 2302 MI.eraseFromParent(); 2303 return true; 2304 } 2305 2306 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2307 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2308 static void toggleSPDenormMode(bool Enable, 2309 MachineIRBuilder &B, 2310 const GCNSubtarget &ST, 2311 AMDGPU::SIModeRegisterDefaults Mode) { 2312 // Set SP denorm mode to this value. 2313 unsigned SPDenormMode = 2314 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2315 2316 if (ST.hasDenormModeInst()) { 2317 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2318 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2319 2320 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2321 B.buildInstr(AMDGPU::S_DENORM_MODE) 2322 .addImm(NewDenormModeValue); 2323 2324 } else { 2325 // Select FP32 bit field in mode register. 2326 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2327 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2328 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2329 2330 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2331 .addImm(SPDenormMode) 2332 .addImm(SPDenormModeBitField); 2333 } 2334 } 2335 2336 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2337 MachineRegisterInfo &MRI, 2338 MachineIRBuilder &B) const { 2339 B.setInstr(MI); 2340 Register Res = MI.getOperand(0).getReg(); 2341 Register LHS = MI.getOperand(1).getReg(); 2342 Register RHS = MI.getOperand(2).getReg(); 2343 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2344 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2345 2346 uint16_t Flags = MI.getFlags(); 2347 2348 LLT S32 = LLT::scalar(32); 2349 LLT S1 = LLT::scalar(1); 2350 2351 auto One = B.buildFConstant(S32, 1.0f); 2352 2353 auto DenominatorScaled = 2354 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2355 .addUse(RHS) 2356 .addUse(LHS) 2357 .addImm(1) 2358 .setMIFlags(Flags); 2359 auto NumeratorScaled = 2360 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2361 .addUse(LHS) 2362 .addUse(RHS) 2363 .addImm(0) 2364 .setMIFlags(Flags); 2365 2366 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2367 .addUse(DenominatorScaled.getReg(0)) 2368 .setMIFlags(Flags); 2369 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2370 2371 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2372 // aren't modeled as reading it. 
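  // A rough sketch of what the sequence below computes (ignoring the denormal
  // mode bracketing): with r0 = rcp(d') for the scaled denominator d' and n'
  // the scaled numerator, Fma0/Fma1 perform one Newton-Raphson refinement
  // r1 = fma(fma(-d', r0, 1.0), r0, r0), Mul forms the quotient estimate
  // q = n' * r1, Fma2/Fma3 refine it to q' = fma(fma(-d', q, n'), r1, q), and
  // Fma4 produces the remaining error term that div_fmas/div_fixup use to
  // finish the division.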
2373 if (!Mode.allFP32Denormals())
2374 toggleSPDenormMode(true, B, ST, Mode);
2375
2376 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2377 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2378 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2379 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2380 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2381 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2382
2383 if (!Mode.allFP32Denormals())
2384 toggleSPDenormMode(false, B, ST, Mode);
2385
2386 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2387 .addUse(Fma4.getReg(0))
2388 .addUse(Fma1.getReg(0))
2389 .addUse(Fma3.getReg(0))
2390 .addUse(NumeratorScaled.getReg(1))
2391 .setMIFlags(Flags);
2392
2393 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2394 .addUse(Fmas.getReg(0))
2395 .addUse(RHS)
2396 .addUse(LHS)
2397 .setMIFlags(Flags);
2398
2399 MI.eraseFromParent();
2400 return true;
2401 }
2402
2403 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2404 MachineRegisterInfo &MRI,
2405 MachineIRBuilder &B) const {
2406 B.setInstr(MI);
2407 Register Res = MI.getOperand(0).getReg();
2408 Register LHS = MI.getOperand(1).getReg();
2409 Register RHS = MI.getOperand(2).getReg();
2410
2411 uint16_t Flags = MI.getFlags();
2412
2413 LLT S64 = LLT::scalar(64);
2414 LLT S1 = LLT::scalar(1);
2415
2416 auto One = B.buildFConstant(S64, 1.0);
2417
2418 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2419 .addUse(LHS)
2420 .addUse(RHS)
2421 .addImm(1)
2422 .setMIFlags(Flags);
2423
2424 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2425
2426 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2427 .addUse(DivScale0.getReg(0))
2428 .setMIFlags(Flags);
2429
2430 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2431 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2432 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2433
2434 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2435 .addUse(LHS)
2436 .addUse(RHS)
2437 .addImm(0)
2438 .setMIFlags(Flags);
2439
2440 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2441 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2442 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2443
2444 Register Scale;
2445 if (!ST.hasUsableDivScaleConditionOutput()) {
2446 // Workaround a hardware bug on SI where the condition output from div_scale
2447 // is not usable.
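    // Roughly (a hedged summary of the comparison below): div_scale only
    // rescales a source whose exponent needs adjusting, so comparing the high
    // 32 bits (the sign/exponent word) of LHS and RHS against the
    // corresponding div_scale results detects whether scaling occurred, and
    // the XOR of the two comparisons reconstructs the condition that div_fmas
    // expects.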
2448 2449 LLT S32 = LLT::scalar(32); 2450 2451 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2452 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2453 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2454 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2455 2456 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2457 Scale1Unmerge.getReg(1)); 2458 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2459 Scale0Unmerge.getReg(1)); 2460 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2461 } else { 2462 Scale = DivScale1.getReg(1); 2463 } 2464 2465 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2466 .addUse(Fma4.getReg(0)) 2467 .addUse(Fma3.getReg(0)) 2468 .addUse(Mul.getReg(0)) 2469 .addUse(Scale) 2470 .setMIFlags(Flags); 2471 2472 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2473 .addUse(Fmas.getReg(0)) 2474 .addUse(RHS) 2475 .addUse(LHS) 2476 .setMIFlags(Flags); 2477 2478 MI.eraseFromParent(); 2479 return true; 2480 } 2481 2482 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2483 MachineRegisterInfo &MRI, 2484 MachineIRBuilder &B) const { 2485 B.setInstr(MI); 2486 Register Res = MI.getOperand(0).getReg(); 2487 Register LHS = MI.getOperand(2).getReg(); 2488 Register RHS = MI.getOperand(3).getReg(); 2489 uint16_t Flags = MI.getFlags(); 2490 2491 LLT S32 = LLT::scalar(32); 2492 LLT S1 = LLT::scalar(1); 2493 2494 auto Abs = B.buildFAbs(S32, RHS, Flags); 2495 const APFloat C0Val(1.0f); 2496 2497 auto C0 = B.buildConstant(S32, 0x6f800000); 2498 auto C1 = B.buildConstant(S32, 0x2f800000); 2499 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2500 2501 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2502 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2503 2504 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2505 2506 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2507 .addUse(Mul0.getReg(0)) 2508 .setMIFlags(Flags); 2509 2510 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2511 2512 B.buildFMul(Res, Sel, Mul1, Flags); 2513 2514 MI.eraseFromParent(); 2515 return true; 2516 } 2517 2518 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2519 MachineRegisterInfo &MRI, 2520 MachineIRBuilder &B) const { 2521 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2522 if (!MFI->isEntryFunction()) { 2523 return legalizePreloadedArgIntrin(MI, MRI, B, 2524 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2525 } 2526 2527 B.setInstr(MI); 2528 2529 uint64_t Offset = 2530 ST.getTargetLowering()->getImplicitParameterOffset( 2531 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2532 Register DstReg = MI.getOperand(0).getReg(); 2533 LLT DstTy = MRI.getType(DstReg); 2534 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2535 2536 const ArgDescriptor *Arg; 2537 const TargetRegisterClass *RC; 2538 std::tie(Arg, RC) 2539 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2540 if (!Arg) 2541 return false; 2542 2543 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2544 if (!loadInputValue(KernargPtrReg, B, Arg)) 2545 return false; 2546 2547 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2548 MI.eraseFromParent(); 2549 return true; 2550 } 2551 2552 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2553 MachineRegisterInfo &MRI, 2554 MachineIRBuilder &B, 2555 unsigned AddrSpace) const { 2556 B.setInstr(MI); 2557 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2558 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2559 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2560 MI.eraseFromParent(); 2561 return true; 2562 } 2563 2564 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2565 // offset (the offset that is included in bounds checking and swizzling, to be 2566 // split between the instruction's voffset and immoffset fields) and soffset 2567 // (the offset that is excluded from bounds checking and swizzling, to go in 2568 // the instruction's soffset field). This function takes the first kind of 2569 // offset and figures out how to split it between voffset and immoffset. 2570 std::tuple<Register, unsigned, unsigned> 2571 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2572 Register OrigOffset) const { 2573 const unsigned MaxImm = 4095; 2574 Register BaseReg; 2575 unsigned TotalConstOffset; 2576 MachineInstr *OffsetDef; 2577 const LLT S32 = LLT::scalar(32); 2578 2579 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2580 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2581 2582 unsigned ImmOffset = TotalConstOffset; 2583 2584 // If the immediate value is too big for the immoffset field, put the value 2585 // and -4096 into the immoffset field so that the value that is copied/added 2586 // for the voffset field is a multiple of 4096, and it stands more chance 2587 // of being CSEd with the copy/add for another similar load/store. 2588 // However, do not do that rounding down to a multiple of 4096 if that is a 2589 // negative number, as it appears to be illegal to have a negative offset 2590 // in the vgpr, even if adding the immediate offset makes it positive. 2591 unsigned Overflow = ImmOffset & ~MaxImm; 2592 ImmOffset -= Overflow; 2593 if ((int32_t)Overflow < 0) { 2594 Overflow += ImmOffset; 2595 ImmOffset = 0; 2596 } 2597 2598 if (Overflow != 0) { 2599 if (!BaseReg) { 2600 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2601 } else { 2602 auto OverflowVal = B.buildConstant(S32, Overflow); 2603 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2604 } 2605 } 2606 2607 if (!BaseReg) 2608 BaseReg = B.buildConstant(S32, 0).getReg(0); 2609 2610 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2611 } 2612 2613 /// Handle register layout difference for f16 images for some subtargets. 2614 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2615 MachineRegisterInfo &MRI, 2616 Register Reg) const { 2617 if (!ST.hasUnpackedD16VMem()) 2618 return Reg; 2619 2620 const LLT S16 = LLT::scalar(16); 2621 const LLT S32 = LLT::scalar(32); 2622 LLT StoreVT = MRI.getType(Reg); 2623 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2624 2625 auto Unmerge = B.buildUnmerge(S16, Reg); 2626 2627 SmallVector<Register, 4> WideRegs; 2628 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2629 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2630 2631 int NumElts = StoreVT.getNumElements(); 2632 2633 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2634 } 2635 2636 Register AMDGPULegalizerInfo::fixStoreSourceType( 2637 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2638 MachineRegisterInfo *MRI = B.getMRI(); 2639 LLT Ty = MRI->getType(VData); 2640 2641 const LLT S16 = LLT::scalar(16); 2642 2643 // Fixup illegal register types for i8 stores. 
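  // For example (a sketch of the path below): an s8 or s16 store value is
  // G_ANYEXT'ed to s32 here, and legalizeBufferStore later selects the
  // G_AMDGPU_BUFFER_STORE_BYTE/SHORT pseudo from the memory size, so only the
  // low bits of the widened register are actually written to memory.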
2644 if (Ty == LLT::scalar(8) || Ty == S16) { 2645 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2646 return AnyExt; 2647 } 2648 2649 if (Ty.isVector()) { 2650 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2651 if (IsFormat) 2652 return handleD16VData(B, *MRI, VData); 2653 } 2654 } 2655 2656 return VData; 2657 } 2658 2659 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2660 MachineRegisterInfo &MRI, 2661 MachineIRBuilder &B, 2662 bool IsTyped, 2663 bool IsFormat) const { 2664 B.setInstr(MI); 2665 2666 Register VData = MI.getOperand(1).getReg(); 2667 LLT Ty = MRI.getType(VData); 2668 LLT EltTy = Ty.getScalarType(); 2669 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2670 const LLT S32 = LLT::scalar(32); 2671 2672 VData = fixStoreSourceType(B, VData, IsFormat); 2673 Register RSrc = MI.getOperand(2).getReg(); 2674 2675 MachineMemOperand *MMO = *MI.memoperands_begin(); 2676 const int MemSize = MMO->getSize(); 2677 2678 unsigned ImmOffset; 2679 unsigned TotalOffset; 2680 2681 // The typed intrinsics add an immediate after the registers. 2682 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2683 2684 // The struct intrinsic variants add one additional operand over raw. 2685 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2686 Register VIndex; 2687 int OpOffset = 0; 2688 if (HasVIndex) { 2689 VIndex = MI.getOperand(3).getReg(); 2690 OpOffset = 1; 2691 } 2692 2693 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2694 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2695 2696 unsigned Format = 0; 2697 if (IsTyped) { 2698 Format = MI.getOperand(5 + OpOffset).getImm(); 2699 ++OpOffset; 2700 } 2701 2702 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2703 2704 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2705 if (TotalOffset != 0) 2706 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2707 2708 unsigned Opc; 2709 if (IsTyped) { 2710 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 2711 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 2712 } else if (IsFormat) { 2713 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 2714 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 2715 } else { 2716 switch (MemSize) { 2717 case 1: 2718 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 2719 break; 2720 case 2: 2721 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 2722 break; 2723 default: 2724 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 2725 break; 2726 } 2727 } 2728 2729 if (!VIndex) 2730 VIndex = B.buildConstant(S32, 0).getReg(0); 2731 2732 auto MIB = B.buildInstr(Opc) 2733 .addUse(VData) // vdata 2734 .addUse(RSrc) // rsrc 2735 .addUse(VIndex) // vindex 2736 .addUse(VOffset) // voffset 2737 .addUse(SOffset) // soffset 2738 .addImm(ImmOffset); // offset(imm) 2739 2740 if (IsTyped) 2741 MIB.addImm(Format); 2742 2743 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2744 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2745 .addMemOperand(MMO); 2746 2747 MI.eraseFromParent(); 2748 return true; 2749 } 2750 2751 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 2752 MachineRegisterInfo &MRI, 2753 MachineIRBuilder &B, 2754 bool IsFormat, 2755 bool IsTyped) const { 2756 B.setInstr(MI); 2757 2758 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
2759 MachineMemOperand *MMO = *MI.memoperands_begin();
2760 const int MemSize = MMO->getSize();
2761 const LLT S32 = LLT::scalar(32);
2762
2763 Register Dst = MI.getOperand(0).getReg();
2764 Register RSrc = MI.getOperand(2).getReg();
2765
2766 // The typed intrinsics add an immediate after the registers.
2767 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2768
2769 // The struct intrinsic variants add one additional operand over raw.
2770 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2771 Register VIndex;
2772 int OpOffset = 0;
2773 if (HasVIndex) {
2774 VIndex = MI.getOperand(3).getReg();
2775 OpOffset = 1;
2776 }
2777
2778 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2779 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2780
2781 unsigned Format = 0;
2782 if (IsTyped) {
2783 Format = MI.getOperand(5 + OpOffset).getImm();
2784 ++OpOffset;
2785 }
2786
2787 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2788 unsigned ImmOffset;
2789 unsigned TotalOffset;
2790
2791 LLT Ty = MRI.getType(Dst);
2792 LLT EltTy = Ty.getScalarType();
2793 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2794 const bool Unpacked = ST.hasUnpackedD16VMem();
2795
2796 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2797 if (TotalOffset != 0)
2798 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2799
2800 unsigned Opc;
2801
2802 if (IsTyped) {
2803 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
2804 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
2805 } else if (IsFormat) {
2806 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
2807 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
2808 } else {
2809 switch (MemSize) {
2810 case 1:
2811 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
2812 break;
2813 case 2:
2814 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
2815 break;
2816 default:
2817 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
2818 break;
2819 }
2820 }
2821
2822 Register LoadDstReg;
2823
2824 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
2825 LLT UnpackedTy = Ty.changeElementSize(32);
2826
2827 if (IsExtLoad)
2828 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
2829 else if (Unpacked && IsD16 && Ty.isVector())
2830 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
2831 else
2832 LoadDstReg = Dst;
2833
2834 if (!VIndex)
2835 VIndex = B.buildConstant(S32, 0).getReg(0);
2836
2837 auto MIB = B.buildInstr(Opc)
2838 .addDef(LoadDstReg) // vdata
2839 .addUse(RSrc) // rsrc
2840 .addUse(VIndex) // vindex
2841 .addUse(VOffset) // voffset
2842 .addUse(SOffset) // soffset
2843 .addImm(ImmOffset); // offset(imm)
2844
2845 if (IsTyped)
2846 MIB.addImm(Format);
2847
2848 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
2849 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2850 .addMemOperand(MMO);
2851
2852 if (LoadDstReg != Dst) {
2853 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2854
2855 // The load was emitted with a widened result register; narrow it back
2856 if (IsExtLoad) 2857 B.buildTrunc(Dst, LoadDstReg); 2858 else { 2859 // Repack to original 16-bit vector result 2860 // FIXME: G_TRUNC should work, but legalization currently fails 2861 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 2862 SmallVector<Register, 4> Repack; 2863 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 2864 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 2865 B.buildMerge(Dst, Repack); 2866 } 2867 } 2868 2869 MI.eraseFromParent(); 2870 return true; 2871 } 2872 2873 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 2874 MachineIRBuilder &B, 2875 bool IsInc) const { 2876 B.setInstr(MI); 2877 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 2878 AMDGPU::G_AMDGPU_ATOMIC_DEC; 2879 B.buildInstr(Opc) 2880 .addDef(MI.getOperand(0).getReg()) 2881 .addUse(MI.getOperand(2).getReg()) 2882 .addUse(MI.getOperand(3).getReg()) 2883 .cloneMemRefs(MI); 2884 MI.eraseFromParent(); 2885 return true; 2886 } 2887 2888 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 2889 switch (IntrID) { 2890 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 2891 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 2892 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 2893 case Intrinsic::amdgcn_raw_buffer_atomic_add: 2894 case Intrinsic::amdgcn_struct_buffer_atomic_add: 2895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 2896 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 2897 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 2898 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 2899 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 2900 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 2901 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 2902 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 2903 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 2904 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 2905 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 2906 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 2907 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 2908 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 2909 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 2910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 2911 case Intrinsic::amdgcn_raw_buffer_atomic_and: 2912 case Intrinsic::amdgcn_struct_buffer_atomic_and: 2913 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 2914 case Intrinsic::amdgcn_raw_buffer_atomic_or: 2915 case Intrinsic::amdgcn_struct_buffer_atomic_or: 2916 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 2917 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 2918 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 2919 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 2920 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 2921 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 2922 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 2923 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 2924 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 2925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 2926 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 2927 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 2928 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 2929 default: 2930 llvm_unreachable("unhandled atomic opcode"); 2931 } 2932 } 2933 2934 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 2935 MachineIRBuilder &B, 2936 Intrinsic::ID IID) const { 2937 B.setInstr(MI); 2938 2939 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 2940 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 2941 2942 Register Dst = MI.getOperand(0).getReg(); 2943 Register VData = 
MI.getOperand(2).getReg();
2944
2945 Register CmpVal;
2946 int OpOffset = 0;
2947
2948 if (IsCmpSwap) {
2949 CmpVal = MI.getOperand(3 + OpOffset).getReg();
2950 ++OpOffset;
2951 }
2952
2953 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
2954 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
2955
2956 // The struct intrinsic variants add one additional operand over raw.
2957 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2958 Register VIndex;
2959 if (HasVIndex) {
2960 VIndex = MI.getOperand(4 + OpOffset).getReg();
2961 ++OpOffset;
2962 }
2963
2964 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
2965 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
2966 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
2967
2968 MachineMemOperand *MMO = *MI.memoperands_begin();
2969
2970 unsigned ImmOffset;
2971 unsigned TotalOffset;
2972 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2973 if (TotalOffset != 0)
2974 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
2975
2976 if (!VIndex)
2977 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
2978
2979 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
2980 .addDef(Dst)
2981 .addUse(VData); // vdata
2982
2983 if (IsCmpSwap)
2984 MIB.addReg(CmpVal);
2985
2986 MIB.addUse(RSrc) // rsrc
2987 .addUse(VIndex) // vindex
2988 .addUse(VOffset) // voffset
2989 .addUse(SOffset) // soffset
2990 .addImm(ImmOffset) // offset(imm)
2991 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
2992 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2993 .addMemOperand(MMO);
2994
2995 MI.eraseFromParent();
2996 return true;
2997 }
2998
2999 // Produce a vector of s16 elements from s32 pieces.
3000 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3001 ArrayRef<Register> UnmergeParts) {
3002 const LLT S16 = LLT::scalar(16);
3003
3004 SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3005 for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3006 RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3007
3008 B.buildBuildVector(DstReg, RemergeParts);
3009 }
3010
3011 /// Convert a set of s32 registers to a result vector with s16 elements.
3012 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3013 ArrayRef<Register> UnmergeParts) {
3014 MachineRegisterInfo &MRI = *B.getMRI();
3015 const LLT V2S16 = LLT::vector(2, 16);
3016 LLT TargetTy = MRI.getType(DstReg);
3017 int NumElts = UnmergeParts.size();
3018
3019 if (NumElts == 1) {
3020 assert(TargetTy == V2S16);
3021 B.buildBitcast(DstReg, UnmergeParts[0]);
3022 return;
3023 }
3024
3025 SmallVector<Register, 4> RemergeParts(NumElts);
3026 for (int I = 0; I != NumElts; ++I)
3027 RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3028
3029 if (TargetTy.getSizeInBits() == 32u * NumElts) {
3030 B.buildConcatVectors(DstReg, RemergeParts);
3031 return;
3032 }
3033
3034 const LLT V3S16 = LLT::vector(3, 16);
3035 const LLT V6S16 = LLT::vector(6, 16);
3036
3037 // Widen to v6s16 and unpack v3 parts.
3038 assert(TargetTy == V3S16);
3039
3040 RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
3041 auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3042 B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3043 }
3044
3045 // FIXME: Just vector trunc should be sufficient, but legalization currently
3046 // broken.
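// A small example of the repacking done here, as informal MIR (assuming an
// unpacked-D16 subtarget where a <2 x s16> d16 load produced <2 x s32>):
//   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %wide:_(<2 x s32>)
//   %a:_(s16) = G_TRUNC %lo
//   %b:_(s16) = G_TRUNC %hi
//   %dst:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)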
3047 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
3048 Register WideDstReg) {
3049 const LLT S32 = LLT::scalar(32);
3050 const LLT S16 = LLT::scalar(16);
3051
3052 auto Unmerge = B.buildUnmerge(S32, WideDstReg);
3053
3054 int NumOps = Unmerge->getNumOperands() - 1;
3055 SmallVector<Register, 4> RemergeParts(NumOps);
3056 for (int I = 0; I != NumOps; ++I)
3057 RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
3058
3059 B.buildBuildVector(DstReg, RemergeParts);
3060 }
3061
3062 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3063 MachineInstr &MI, MachineIRBuilder &B,
3064 GISelChangeObserver &Observer,
3065 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3066 bool IsTFE = MI.getNumExplicitDefs() == 2;
3067
3068 // We are only processing the operands of d16 image operations on subtargets
3069 // that use the unpacked register layout, or need to repack the TFE result.
3070
3071 // TODO: Need to handle a16 images too
3072 // TODO: Do we need to guard against already legalized intrinsics?
3073 if (!IsTFE && !ST.hasUnpackedD16VMem())
3074 return true;
3075
3076 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3077 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3078
3079 if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
3080 return true;
3081
3082 B.setInstr(MI);
3083
3084 MachineRegisterInfo *MRI = B.getMRI();
3085 const LLT S32 = LLT::scalar(32);
3086 const LLT S16 = LLT::scalar(16);
3087
3088 if (BaseOpcode->Store) { // No TFE for stores?
3089 Register VData = MI.getOperand(1).getReg();
3090 LLT Ty = MRI->getType(VData);
3091 if (!Ty.isVector() || Ty.getElementType() != S16)
3092 return true;
3093
3094 B.setInstr(MI);
3095
3096 Observer.changingInstr(MI);
3097 MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3098 Observer.changedInstr(MI);
3099 return true;
3100 }
3101
3102 Register DstReg = MI.getOperand(0).getReg();
3103 LLT Ty = MRI->getType(DstReg);
3104 const LLT EltTy = Ty.getScalarType();
3105 const bool IsD16 = Ty.getScalarType() == S16;
3106 const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3107
3108 if (IsTFE) {
3109 // In the IR, TFE is supposed to be used with a 2 element struct return
3110 // type. The instruction really returns these two values in one contiguous
3111 // register, with one additional dword beyond the loaded data. Rewrite the
3112 // return type to use a single register result.
3113 Register Dst1Reg = MI.getOperand(1).getReg();
3114 if (MRI->getType(Dst1Reg) != S32)
3115 return false;
3116
3117 // TODO: Make sure the TFE operand bit is set.
3118
3119 // The raw dword aligned data component of the load. The only legal cases
3120 // where this matters should be when using the packed D16 format, for
3121 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3122 LLT RoundedTy;
3123 LLT TFETy;
3124
3125 if (IsD16 && ST.hasUnpackedD16VMem()) {
3126 RoundedTy = LLT::scalarOrVector(NumElts, 32);
3127 TFETy = LLT::vector(NumElts + 1, 32);
3128 } else {
3129 unsigned EltSize = Ty.getScalarSizeInBits();
3130 unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3131 unsigned RoundedSize = 32 * RoundedElts;
3132 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3133 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3134 }
3135
3136 Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3137 Observer.changingInstr(MI);
3138
3139 MI.getOperand(0).setReg(TFEReg);
3140 MI.RemoveOperand(1);
3141
3142 Observer.changedInstr(MI);
3143
3144 // Insert after the instruction.
3145 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3146 3147 // Now figure out how to copy the new result register back into the old 3148 // result. 3149 3150 SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg); 3151 int NumDataElts = TFETy.getNumElements() - 1; 3152 3153 if (!Ty.isVector()) { 3154 // Simplest case is a trivial unmerge (plus a truncate for d16). 3155 UnmergeResults[0] = Ty == S32 ? 3156 DstReg : MRI->createGenericVirtualRegister(S32); 3157 3158 B.buildUnmerge(UnmergeResults, TFEReg); 3159 if (Ty != S32) 3160 B.buildTrunc(DstReg, UnmergeResults[0]); 3161 return true; 3162 } 3163 3164 // We have to repack into a new vector of some kind. 3165 for (int I = 0; I != NumDataElts; ++I) 3166 UnmergeResults[I] = MRI->createGenericVirtualRegister(S32); 3167 B.buildUnmerge(UnmergeResults, TFEReg); 3168 3169 // Drop the final TFE element. 3170 ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts); 3171 3172 if (EltTy == S32) 3173 B.buildBuildVector(DstReg, DataPart); 3174 else if (ST.hasUnpackedD16VMem()) 3175 truncToS16Vector(B, DstReg, DataPart); 3176 else 3177 bitcastToS16Vector(B, DstReg, DataPart); 3178 3179 return true; 3180 } 3181 3182 // Must be an image load. 3183 if (!Ty.isVector() || Ty.getElementType() != S16) 3184 return true; 3185 3186 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3187 3188 LLT WidenedTy = Ty.changeElementType(S32); 3189 Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy); 3190 3191 Observer.changingInstr(MI); 3192 MI.getOperand(0).setReg(WideDstReg); 3193 Observer.changedInstr(MI); 3194 3195 repackUnpackedD16Load(B, DstReg, WideDstReg); 3196 return true; 3197 } 3198 3199 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 3200 MachineInstr &MI, MachineIRBuilder &B, 3201 GISelChangeObserver &Observer) const { 3202 Register Dst = MI.getOperand(0).getReg(); 3203 LLT Ty = B.getMRI()->getType(Dst); 3204 unsigned Size = Ty.getSizeInBits(); 3205 MachineFunction &MF = B.getMF(); 3206 3207 Observer.changingInstr(MI); 3208 3209 // FIXME: We don't really need this intermediate instruction. The intrinsic 3210 // should be fixed to have a memory operand. Since it's readnone, we're not 3211 // allowed to add one. 3212 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 3213 MI.RemoveOperand(1); // Remove intrinsic ID 3214 3215 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 3216 // TODO: Should this use datalayout alignment? 3217 const unsigned MemSize = (Size + 7) / 8; 3218 const unsigned MemAlign = 4; 3219 MachineMemOperand *MMO = MF.getMachineMemOperand( 3220 MachinePointerInfo(), 3221 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 3222 MachineMemOperand::MOInvariant, MemSize, MemAlign); 3223 MI.addMemOperand(MF, MMO); 3224 3225 // There are no 96-bit result scalar loads, but widening to 128-bit should 3226 // always be legal. We may need to restore this to a 96-bit result if it turns 3227 // out this needs to be converted to a vector load during RegBankSelect. 
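  // For instance (a sketch of the widening below): an s96 result becomes s128
  // via widenScalarDst and a <3 x s32> result becomes <4 x s32> via
  // moreElementsVectorDst; the helper then extracts or truncates the original
  // narrower value back out of the wider load result.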
3228 if (!isPowerOf2_32(Size)) { 3229 LegalizerHelper Helper(MF, *this, Observer, B); 3230 B.setInstr(MI); 3231 3232 if (Ty.isVector()) 3233 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 3234 else 3235 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 3236 } 3237 3238 Observer.changedInstr(MI); 3239 return true; 3240 } 3241 3242 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 3243 MachineIRBuilder &B, 3244 GISelChangeObserver &Observer) const { 3245 MachineRegisterInfo &MRI = *B.getMRI(); 3246 3247 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 3248 auto IntrID = MI.getIntrinsicID(); 3249 switch (IntrID) { 3250 case Intrinsic::amdgcn_if: 3251 case Intrinsic::amdgcn_else: { 3252 MachineInstr *Br = nullptr; 3253 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3254 const SIRegisterInfo *TRI 3255 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3256 3257 B.setInstr(*BrCond); 3258 Register Def = MI.getOperand(1).getReg(); 3259 Register Use = MI.getOperand(3).getReg(); 3260 3261 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 3262 if (Br) 3263 BrTarget = Br->getOperand(0).getMBB(); 3264 3265 if (IntrID == Intrinsic::amdgcn_if) { 3266 B.buildInstr(AMDGPU::SI_IF) 3267 .addDef(Def) 3268 .addUse(Use) 3269 .addMBB(BrTarget); 3270 } else { 3271 B.buildInstr(AMDGPU::SI_ELSE) 3272 .addDef(Def) 3273 .addUse(Use) 3274 .addMBB(BrTarget) 3275 .addImm(0); 3276 } 3277 3278 if (Br) 3279 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 3280 3281 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 3282 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 3283 MI.eraseFromParent(); 3284 BrCond->eraseFromParent(); 3285 return true; 3286 } 3287 3288 return false; 3289 } 3290 case Intrinsic::amdgcn_loop: { 3291 MachineInstr *Br = nullptr; 3292 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3293 const SIRegisterInfo *TRI 3294 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3295 3296 B.setInstr(*BrCond); 3297 3298 // FIXME: Need to adjust branch targets based on unconditional branch. 
3299 Register Reg = MI.getOperand(2).getReg(); 3300 B.buildInstr(AMDGPU::SI_LOOP) 3301 .addUse(Reg) 3302 .addMBB(BrCond->getOperand(1).getMBB()); 3303 MI.eraseFromParent(); 3304 BrCond->eraseFromParent(); 3305 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 3306 return true; 3307 } 3308 3309 return false; 3310 } 3311 case Intrinsic::amdgcn_kernarg_segment_ptr: 3312 return legalizePreloadedArgIntrin( 3313 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3314 case Intrinsic::amdgcn_implicitarg_ptr: 3315 return legalizeImplicitArgPtr(MI, MRI, B); 3316 case Intrinsic::amdgcn_workitem_id_x: 3317 return legalizePreloadedArgIntrin(MI, MRI, B, 3318 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 3319 case Intrinsic::amdgcn_workitem_id_y: 3320 return legalizePreloadedArgIntrin(MI, MRI, B, 3321 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 3322 case Intrinsic::amdgcn_workitem_id_z: 3323 return legalizePreloadedArgIntrin(MI, MRI, B, 3324 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 3325 case Intrinsic::amdgcn_workgroup_id_x: 3326 return legalizePreloadedArgIntrin(MI, MRI, B, 3327 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 3328 case Intrinsic::amdgcn_workgroup_id_y: 3329 return legalizePreloadedArgIntrin(MI, MRI, B, 3330 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 3331 case Intrinsic::amdgcn_workgroup_id_z: 3332 return legalizePreloadedArgIntrin(MI, MRI, B, 3333 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 3334 case Intrinsic::amdgcn_dispatch_ptr: 3335 return legalizePreloadedArgIntrin(MI, MRI, B, 3336 AMDGPUFunctionArgInfo::DISPATCH_PTR); 3337 case Intrinsic::amdgcn_queue_ptr: 3338 return legalizePreloadedArgIntrin(MI, MRI, B, 3339 AMDGPUFunctionArgInfo::QUEUE_PTR); 3340 case Intrinsic::amdgcn_implicit_buffer_ptr: 3341 return legalizePreloadedArgIntrin( 3342 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 3343 case Intrinsic::amdgcn_dispatch_id: 3344 return legalizePreloadedArgIntrin(MI, MRI, B, 3345 AMDGPUFunctionArgInfo::DISPATCH_ID); 3346 case Intrinsic::amdgcn_fdiv_fast: 3347 return legalizeFDIVFastIntrin(MI, MRI, B); 3348 case Intrinsic::amdgcn_is_shared: 3349 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 3350 case Intrinsic::amdgcn_is_private: 3351 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 3352 case Intrinsic::amdgcn_wavefrontsize: { 3353 B.setInstr(MI); 3354 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 3355 MI.eraseFromParent(); 3356 return true; 3357 } 3358 case Intrinsic::amdgcn_s_buffer_load: 3359 return legalizeSBufferLoad(MI, B, Observer); 3360 case Intrinsic::amdgcn_raw_buffer_store: 3361 case Intrinsic::amdgcn_struct_buffer_store: 3362 return legalizeBufferStore(MI, MRI, B, false, false); 3363 case Intrinsic::amdgcn_raw_buffer_store_format: 3364 case Intrinsic::amdgcn_struct_buffer_store_format: 3365 return legalizeBufferStore(MI, MRI, B, false, true); 3366 case Intrinsic::amdgcn_raw_tbuffer_store: 3367 case Intrinsic::amdgcn_struct_tbuffer_store: 3368 return legalizeBufferStore(MI, MRI, B, true, true); 3369 case Intrinsic::amdgcn_raw_buffer_load: 3370 case Intrinsic::amdgcn_struct_buffer_load: 3371 return legalizeBufferLoad(MI, MRI, B, false, false); 3372 case Intrinsic::amdgcn_raw_buffer_load_format: 3373 case Intrinsic::amdgcn_struct_buffer_load_format: 3374 return legalizeBufferLoad(MI, MRI, B, true, false); 3375 case Intrinsic::amdgcn_raw_tbuffer_load: 3376 case Intrinsic::amdgcn_struct_tbuffer_load: 3377 return legalizeBufferLoad(MI, MRI, B, true, true); 3378 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3379 case 
Intrinsic::amdgcn_struct_buffer_atomic_swap: 3380 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3381 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3382 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3383 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3384 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3385 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3386 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3387 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3388 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3389 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3390 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3391 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3392 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3393 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3394 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3395 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3396 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3397 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3398 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3399 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3400 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3401 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3402 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3403 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3404 return legalizeBufferAtomic(MI, B, IntrID); 3405 case Intrinsic::amdgcn_atomic_inc: 3406 return legalizeAtomicIncDec(MI, B, true); 3407 case Intrinsic::amdgcn_atomic_dec: 3408 return legalizeAtomicIncDec(MI, B, false); 3409 default: { 3410 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 3411 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 3412 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 3413 return true; 3414 } 3415 } 3416 3417 return true; 3418 } 3419