//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 153 return [=](const LegalityQuery &Query) { 154 const LLT Ty = Query.Types[TypeIdx]; 155 if (Ty.isVector()) { 156 const int EltSize = Ty.getElementType().getSizeInBits(); 157 return EltSize == 32 || EltSize == 64 || 158 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 159 EltSize == 128 || EltSize == 256; 160 } 161 162 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 163 }; 164 } 165 166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 167 return [=](const LegalityQuery &Query) { 168 const LLT QueryTy = Query.Types[TypeIdx]; 169 return QueryTy.isVector() && QueryTy.getElementType() == Type; 170 }; 171 } 172 173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 174 return [=](const LegalityQuery &Query) { 175 const LLT Ty = Query.Types[TypeIdx]; 176 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 177 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 178 }; 179 } 180 181 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 182 const GCNTargetMachine &TM) 183 : ST(ST_) { 184 using namespace TargetOpcode; 185 186 auto GetAddrSpacePtr = [&TM](unsigned AS) { 187 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 188 }; 189 190 const LLT S1 = LLT::scalar(1); 191 const LLT S16 = LLT::scalar(16); 192 const LLT S32 = LLT::scalar(32); 193 const LLT S64 = LLT::scalar(64); 194 const LLT S128 = LLT::scalar(128); 195 const LLT S256 = LLT::scalar(256); 196 const LLT S1024 = LLT::scalar(1024); 197 198 const LLT V2S16 = LLT::vector(2, 16); 199 const LLT V4S16 = LLT::vector(4, 16); 200 201 const LLT V2S32 = LLT::vector(2, 32); 202 const LLT V3S32 = LLT::vector(3, 32); 203 const LLT V4S32 = LLT::vector(4, 32); 204 const LLT V5S32 = LLT::vector(5, 32); 205 const LLT V6S32 = LLT::vector(6, 32); 206 const LLT V7S32 = LLT::vector(7, 32); 207 const LLT V8S32 = LLT::vector(8, 32); 208 const LLT V9S32 = LLT::vector(9, 32); 209 const LLT V10S32 = LLT::vector(10, 32); 210 const LLT V11S32 = LLT::vector(11, 32); 211 const LLT V12S32 = LLT::vector(12, 32); 212 const LLT V13S32 = LLT::vector(13, 32); 213 const LLT V14S32 = LLT::vector(14, 32); 214 const LLT V15S32 = LLT::vector(15, 32); 215 const LLT V16S32 = LLT::vector(16, 32); 216 const LLT V32S32 = LLT::vector(32, 32); 217 218 const LLT V2S64 = LLT::vector(2, 64); 219 const LLT V3S64 = LLT::vector(3, 64); 220 const LLT V4S64 = LLT::vector(4, 64); 221 const LLT V5S64 = LLT::vector(5, 64); 222 const LLT V6S64 = LLT::vector(6, 64); 223 const LLT V7S64 = LLT::vector(7, 64); 224 const LLT V8S64 = LLT::vector(8, 64); 225 const LLT V16S64 = LLT::vector(16, 64); 226 227 std::initializer_list<LLT> AllS32Vectors = 228 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 229 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 230 std::initializer_list<LLT> AllS64Vectors = 231 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 232 233 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 234 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 235 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 236 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 237 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 238 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 239 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 240 241 const LLT CodePtr = FlatPtr; 242 243 const std::initializer_list<LLT> AddrSpaces64 = { 
244 GlobalPtr, ConstantPtr, FlatPtr 245 }; 246 247 const std::initializer_list<LLT> AddrSpaces32 = { 248 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 249 }; 250 251 const std::initializer_list<LLT> FPTypesBase = { 252 S32, S64 253 }; 254 255 const std::initializer_list<LLT> FPTypes16 = { 256 S32, S64, S16 257 }; 258 259 const std::initializer_list<LLT> FPTypesPK16 = { 260 S32, S64, S16, V2S16 261 }; 262 263 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 264 265 setAction({G_BRCOND, S1}, Legal); // VCC branches 266 setAction({G_BRCOND, S32}, Legal); // SCC branches 267 268 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 269 // elements for v3s16 270 getActionDefinitionsBuilder(G_PHI) 271 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 272 .legalFor(AllS32Vectors) 273 .legalFor(AllS64Vectors) 274 .legalFor(AddrSpaces64) 275 .legalFor(AddrSpaces32) 276 .clampScalar(0, S32, S256) 277 .widenScalarToNextPow2(0, 32) 278 .clampMaxNumElements(0, S32, 16) 279 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 280 .legalIf(isPointer(0)); 281 282 if (ST.has16BitInsts()) { 283 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 284 .legalFor({S32, S16}) 285 .clampScalar(0, S16, S32) 286 .scalarize(0) 287 .widenScalarToNextPow2(0, 32); 288 } else { 289 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 290 .legalFor({S32}) 291 .clampScalar(0, S32, S32) 292 .scalarize(0); 293 } 294 295 // FIXME: Not really legal. Placeholder for custom lowering. 296 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 297 .legalFor({S32, S64}) 298 .clampScalar(0, S32, S64) 299 .widenScalarToNextPow2(0, 32) 300 .scalarize(0); 301 302 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 303 .legalFor({S32}) 304 .clampScalar(0, S32, S32) 305 .scalarize(0); 306 307 // Report legal for any types we can handle anywhere. For the cases only legal 308 // on the SALU, RegBankSelect will be able to re-legalize. 309 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 310 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 311 .clampScalar(0, S32, S64) 312 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 313 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 314 .widenScalarToNextPow2(0) 315 .scalarize(0); 316 317 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 318 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 319 .legalFor({{S32, S1}, {S32, S32}}) 320 .clampScalar(0, S32, S32) 321 .scalarize(0); // TODO: Implement. 322 323 getActionDefinitionsBuilder(G_BITCAST) 324 // Don't worry about the size constraint. 
325 .legalIf(all(isRegisterType(0), isRegisterType(1))) 326 .lower(); 327 328 329 getActionDefinitionsBuilder(G_CONSTANT) 330 .legalFor({S1, S32, S64, S16, GlobalPtr, 331 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 332 .clampScalar(0, S32, S64) 333 .widenScalarToNextPow2(0) 334 .legalIf(isPointer(0)); 335 336 getActionDefinitionsBuilder(G_FCONSTANT) 337 .legalFor({S32, S64, S16}) 338 .clampScalar(0, S16, S64); 339 340 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 341 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 342 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 343 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 344 .clampScalarOrElt(0, S32, S1024) 345 .legalIf(isMultiple32(0)) 346 .widenScalarToNextPow2(0, 32) 347 .clampMaxNumElements(0, S32, 16); 348 349 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 350 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 351 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); 352 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 353 354 auto &FPOpActions = getActionDefinitionsBuilder( 355 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 356 .legalFor({S32, S64}); 357 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 358 .customFor({S32, S64}); 359 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 360 .customFor({S32, S64}); 361 362 if (ST.has16BitInsts()) { 363 if (ST.hasVOP3PInsts()) 364 FPOpActions.legalFor({S16, V2S16}); 365 else 366 FPOpActions.legalFor({S16}); 367 368 TrigActions.customFor({S16}); 369 FDIVActions.customFor({S16}); 370 } 371 372 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 373 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 374 375 if (ST.hasVOP3PInsts()) { 376 MinNumMaxNum.customFor(FPTypesPK16) 377 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 378 .clampMaxNumElements(0, S16, 2) 379 .clampScalar(0, S16, S64) 380 .scalarize(0); 381 } else if (ST.has16BitInsts()) { 382 MinNumMaxNum.customFor(FPTypes16) 383 .clampScalar(0, S16, S64) 384 .scalarize(0); 385 } else { 386 MinNumMaxNum.customFor(FPTypesBase) 387 .clampScalar(0, S32, S64) 388 .scalarize(0); 389 } 390 391 if (ST.hasVOP3PInsts()) 392 FPOpActions.clampMaxNumElements(0, S16, 2); 393 394 FPOpActions 395 .scalarize(0) 396 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 397 398 TrigActions 399 .scalarize(0) 400 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 401 402 FDIVActions 403 .scalarize(0) 404 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 405 406 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 407 .legalFor(FPTypesPK16) 408 .clampMaxNumElements(0, S16, 2) 409 .scalarize(0) 410 .clampScalar(0, S16, S64); 411 412 if (ST.has16BitInsts()) { 413 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 414 .legalFor({S32, S64, S16}) 415 .scalarize(0) 416 .clampScalar(0, S16, S64); 417 } else { 418 getActionDefinitionsBuilder(G_FSQRT) 419 .legalFor({S32, S64}) 420 .scalarize(0) 421 .clampScalar(0, S32, S64); 422 423 if (ST.hasFractBug()) { 424 getActionDefinitionsBuilder(G_FFLOOR) 425 .customFor({S64}) 426 .legalFor({S32, S64}) 427 .scalarize(0) 428 .clampScalar(0, S32, S64); 429 } else { 430 getActionDefinitionsBuilder(G_FFLOOR) 431 .legalFor({S32, S64}) 432 .scalarize(0) 433 .clampScalar(0, S32, S64); 434 } 435 } 436 437 getActionDefinitionsBuilder(G_FPTRUNC) 438 .legalFor({{S32, S64}, {S16, S32}}) 439 .scalarize(0) 440 .lower(); 441 442 getActionDefinitionsBuilder(G_FPEXT) 443 .legalFor({{S64, S32}, {S32, S16}}) 444 .lowerFor({{S64, S16}}) // FIXME: Implement 445 .scalarize(0); 446 447 getActionDefinitionsBuilder(G_FSUB) 448 // Use actual fsub instruction 449 .legalFor({S32}) 450 // Must use fadd + fneg 451 .lowerFor({S64, S16, V2S16}) 452 .scalarize(0) 453 .clampScalar(0, S32, S64); 454 455 // Whether this is legal depends on the floating point mode for the function. 456 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 457 if (ST.hasMadF16()) 458 FMad.customFor({S32, S16}); 459 else 460 FMad.customFor({S32}); 461 FMad.scalarize(0) 462 .lower(); 463 464 getActionDefinitionsBuilder(G_TRUNC) 465 .alwaysLegal(); 466 467 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 468 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 469 {S32, S1}, {S64, S1}, {S16, S1}}) 470 .scalarize(0) 471 .clampScalar(0, S32, S64) 472 .widenScalarToNextPow2(1, 32); 473 474 // TODO: Split s1->s64 during regbankselect for VALU. 475 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 476 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 477 .lowerFor({{S32, S64}}) 478 .lowerIf(typeIs(1, S1)) 479 .customFor({{S64, S64}}); 480 if (ST.has16BitInsts()) 481 IToFP.legalFor({{S16, S16}}); 482 IToFP.clampScalar(1, S32, S64) 483 .scalarize(0); 484 485 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 486 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 487 .customFor({{S64, S64}}); 488 if (ST.has16BitInsts()) 489 FPToI.legalFor({{S16, S16}}); 490 else 491 FPToI.minScalar(1, S32); 492 493 FPToI.minScalar(0, S32) 494 .scalarize(0) 495 .lower(); 496 497 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 498 .scalarize(0) 499 .lower(); 500 501 if (ST.has16BitInsts()) { 502 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 503 .legalFor({S16, S32, S64}) 504 .clampScalar(0, S16, S64) 505 .scalarize(0); 506 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 507 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 508 .legalFor({S32, S64}) 509 .clampScalar(0, S32, S64) 510 .scalarize(0); 511 } else { 512 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 513 .legalFor({S32}) 514 .customFor({S64}) 515 .clampScalar(0, S32, S64) 516 .scalarize(0); 517 } 518 519 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 520 .scalarize(0) 521 .alwaysLegal(); 522 523 auto &CmpBuilder = 524 getActionDefinitionsBuilder(G_ICMP) 525 // The compare output type differs based on the register bank of the output, 526 // so make both s1 and s32 legal. 
527 // 528 // Scalar compares producing output in scc will be promoted to s32, as that 529 // is the allocatable register type that will be needed for the copy from 530 // scc. This will be promoted during RegBankSelect, and we assume something 531 // before that won't try to use s32 result types. 532 // 533 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 534 // bank. 535 .legalForCartesianProduct( 536 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 537 .legalForCartesianProduct( 538 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 539 if (ST.has16BitInsts()) { 540 CmpBuilder.legalFor({{S1, S16}}); 541 } 542 543 CmpBuilder 544 .widenScalarToNextPow2(1) 545 .clampScalar(1, S32, S64) 546 .scalarize(0) 547 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 548 549 getActionDefinitionsBuilder(G_FCMP) 550 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 551 .widenScalarToNextPow2(1) 552 .clampScalar(1, S32, S64) 553 .scalarize(0); 554 555 // FIXME: fpow has a selection pattern that should move to custom lowering. 556 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW}); 557 if (ST.has16BitInsts()) 558 Exp2Ops.legalFor({S32, S16}); 559 else 560 Exp2Ops.legalFor({S32}); 561 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 562 Exp2Ops.scalarize(0); 563 564 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10}); 565 if (ST.has16BitInsts()) 566 ExpOps.customFor({{S32}, {S16}}); 567 else 568 ExpOps.customFor({S32}); 569 ExpOps.clampScalar(0, MinScalarFPTy, S32) 570 .scalarize(0); 571 572 // The 64-bit versions produce 32-bit results, but only on the SALU. 573 getActionDefinitionsBuilder(G_CTPOP) 574 .legalFor({{S32, S32}, {S32, S64}}) 575 .clampScalar(0, S32, S32) 576 .clampScalar(1, S32, S64) 577 .scalarize(0) 578 .widenScalarToNextPow2(0, 32) 579 .widenScalarToNextPow2(1, 32); 580 581 // The hardware instructions return a different result on 0 than the generic 582 // instructions expect. The hardware produces -1, but these produce the 583 // bitwidth. 584 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 585 .scalarize(0) 586 .clampScalar(0, S32, S32) 587 .clampScalar(1, S32, S64) 588 .widenScalarToNextPow2(0, 32) 589 .widenScalarToNextPow2(1, 32) 590 .lower(); 591 592 // The 64-bit versions produce 32-bit results, but only on the SALU. 593 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 594 .legalFor({{S32, S32}, {S32, S64}}) 595 .clampScalar(0, S32, S32) 596 .clampScalar(1, S32, S64) 597 .scalarize(0) 598 .widenScalarToNextPow2(0, 32) 599 .widenScalarToNextPow2(1, 32); 600 601 getActionDefinitionsBuilder(G_BITREVERSE) 602 .legalFor({S32}) 603 .clampScalar(0, S32, S32) 604 .scalarize(0); 605 606 if (ST.has16BitInsts()) { 607 getActionDefinitionsBuilder(G_BSWAP) 608 .legalFor({S16, S32, V2S16}) 609 .clampMaxNumElements(0, S16, 2) 610 // FIXME: Fixing non-power-of-2 before clamp is workaround for 611 // narrowScalar limitation. 
612 .widenScalarToNextPow2(0) 613 .clampScalar(0, S16, S32) 614 .scalarize(0); 615 616 if (ST.hasVOP3PInsts()) { 617 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 618 .legalFor({S32, S16, V2S16}) 619 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 620 .clampMaxNumElements(0, S16, 2) 621 .clampScalar(0, S16, S32) 622 .widenScalarToNextPow2(0) 623 .scalarize(0); 624 } else { 625 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 626 .legalFor({S32, S16}) 627 .widenScalarToNextPow2(0) 628 .clampScalar(0, S16, S32) 629 .scalarize(0); 630 } 631 } else { 632 // TODO: Should have same legality without v_perm_b32 633 getActionDefinitionsBuilder(G_BSWAP) 634 .legalFor({S32}) 635 .lowerIf(narrowerThan(0, 32)) 636 // FIXME: Fixing non-power-of-2 before clamp is workaround for 637 // narrowScalar limitation. 638 .widenScalarToNextPow2(0) 639 .maxScalar(0, S32) 640 .scalarize(0) 641 .lower(); 642 643 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 644 .legalFor({S32}) 645 .clampScalar(0, S32, S32) 646 .widenScalarToNextPow2(0) 647 .scalarize(0); 648 } 649 650 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 651 return [=](const LegalityQuery &Query) { 652 return Query.Types[TypeIdx0].getSizeInBits() < 653 Query.Types[TypeIdx1].getSizeInBits(); 654 }; 655 }; 656 657 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 658 return [=](const LegalityQuery &Query) { 659 return Query.Types[TypeIdx0].getSizeInBits() > 660 Query.Types[TypeIdx1].getSizeInBits(); 661 }; 662 }; 663 664 getActionDefinitionsBuilder(G_INTTOPTR) 665 // List the common cases 666 .legalForCartesianProduct(AddrSpaces64, {S64}) 667 .legalForCartesianProduct(AddrSpaces32, {S32}) 668 .scalarize(0) 669 // Accept any address space as long as the size matches 670 .legalIf(sameSize(0, 1)) 671 .widenScalarIf(smallerThan(1, 0), 672 [](const LegalityQuery &Query) { 673 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 674 }) 675 .narrowScalarIf(greaterThan(1, 0), 676 [](const LegalityQuery &Query) { 677 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 678 }); 679 680 getActionDefinitionsBuilder(G_PTRTOINT) 681 // List the common cases 682 .legalForCartesianProduct(AddrSpaces64, {S64}) 683 .legalForCartesianProduct(AddrSpaces32, {S32}) 684 .scalarize(0) 685 // Accept any address space as long as the size matches 686 .legalIf(sameSize(0, 1)) 687 .widenScalarIf(smallerThan(0, 1), 688 [](const LegalityQuery &Query) { 689 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 690 }) 691 .narrowScalarIf( 692 greaterThan(0, 1), 693 [](const LegalityQuery &Query) { 694 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 695 }); 696 697 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 698 .scalarize(0) 699 .custom(); 700 701 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 702 // handle some operations by just promoting the register during 703 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 704 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 705 switch (AS) { 706 // FIXME: Private element size. 707 case AMDGPUAS::PRIVATE_ADDRESS: 708 return 32; 709 // FIXME: Check subtarget 710 case AMDGPUAS::LOCAL_ADDRESS: 711 return ST.useDS128() ? 128 : 64; 712 713 // Treat constant and global as identical. 
SMRD loads are sometimes usable 714 // for global loads (ideally constant address space should be eliminated) 715 // depending on the context. Legality cannot be context dependent, but 716 // RegBankSelect can split the load as necessary depending on the pointer 717 // register bank/uniformity and if the memory is invariant or not written in 718 // a kernel. 719 case AMDGPUAS::CONSTANT_ADDRESS: 720 case AMDGPUAS::GLOBAL_ADDRESS: 721 return IsLoad ? 512 : 128; 722 default: 723 return 128; 724 } 725 }; 726 727 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 728 bool IsLoad) -> bool { 729 const LLT DstTy = Query.Types[0]; 730 731 // Split vector extloads. 732 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 733 unsigned Align = Query.MMODescrs[0].AlignInBits; 734 735 if (MemSize < DstTy.getSizeInBits()) 736 MemSize = std::max(MemSize, Align); 737 738 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 739 return true; 740 741 const LLT PtrTy = Query.Types[1]; 742 unsigned AS = PtrTy.getAddressSpace(); 743 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 744 return true; 745 746 // Catch weird sized loads that don't evenly divide into the access sizes 747 // TODO: May be able to widen depending on alignment etc. 748 unsigned NumRegs = (MemSize + 31) / 32; 749 if (NumRegs == 3) { 750 if (!ST.hasDwordx3LoadStores()) 751 return true; 752 } else { 753 // If the alignment allows, these should have been widened. 754 if (!isPowerOf2_32(NumRegs)) 755 return true; 756 } 757 758 if (Align < MemSize) { 759 const SITargetLowering *TLI = ST.getTargetLowering(); 760 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 761 } 762 763 return false; 764 }; 765 766 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool { 767 unsigned Size = Query.Types[0].getSizeInBits(); 768 if (isPowerOf2_32(Size)) 769 return false; 770 771 if (Size == 96 && ST.hasDwordx3LoadStores()) 772 return false; 773 774 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 775 if (Size >= maxSizeForAddrSpace(AddrSpace, true)) 776 return false; 777 778 unsigned Align = Query.MMODescrs[0].AlignInBits; 779 unsigned RoundedSize = NextPowerOf2(Size); 780 return (Align >= RoundedSize); 781 }; 782 783 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 784 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 785 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 786 787 // TODO: Refine based on subtargets which support unaligned access or 128-bit 788 // LDS 789 // TODO: Unsupported flat for SI. 790 791 for (unsigned Op : {G_LOAD, G_STORE}) { 792 const bool IsStore = Op == G_STORE; 793 794 auto &Actions = getActionDefinitionsBuilder(Op); 795 // Whitelist the common cases. 
796 // TODO: Loads to s16 on gfx9 797 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 798 {V2S32, GlobalPtr, 64, GlobalAlign32}, 799 {V4S32, GlobalPtr, 128, GlobalAlign32}, 800 {S128, GlobalPtr, 128, GlobalAlign32}, 801 {S64, GlobalPtr, 64, GlobalAlign32}, 802 {V2S64, GlobalPtr, 128, GlobalAlign32}, 803 {V2S16, GlobalPtr, 32, GlobalAlign32}, 804 {S32, GlobalPtr, 8, GlobalAlign8}, 805 {S32, GlobalPtr, 16, GlobalAlign16}, 806 807 {S32, LocalPtr, 32, 32}, 808 {S64, LocalPtr, 64, 32}, 809 {V2S32, LocalPtr, 64, 32}, 810 {S32, LocalPtr, 8, 8}, 811 {S32, LocalPtr, 16, 16}, 812 {V2S16, LocalPtr, 32, 32}, 813 814 {S32, PrivatePtr, 32, 32}, 815 {S32, PrivatePtr, 8, 8}, 816 {S32, PrivatePtr, 16, 16}, 817 {V2S16, PrivatePtr, 32, 32}, 818 819 {S32, FlatPtr, 32, GlobalAlign32}, 820 {S32, FlatPtr, 16, GlobalAlign16}, 821 {S32, FlatPtr, 8, GlobalAlign8}, 822 {V2S16, FlatPtr, 32, GlobalAlign32}, 823 824 {S32, ConstantPtr, 32, GlobalAlign32}, 825 {V2S32, ConstantPtr, 64, GlobalAlign32}, 826 {V4S32, ConstantPtr, 128, GlobalAlign32}, 827 {S64, ConstantPtr, 64, GlobalAlign32}, 828 {S128, ConstantPtr, 128, GlobalAlign32}, 829 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 830 Actions 831 .customIf(typeIs(1, Constant32Ptr)) 832 // Widen suitably aligned loads by loading extra elements. 833 .moreElementsIf([=](const LegalityQuery &Query) { 834 const LLT Ty = Query.Types[0]; 835 return Op == G_LOAD && Ty.isVector() && 836 shouldWidenLoadResult(Query); 837 }, moreElementsToNextPow2(0)) 838 .widenScalarIf([=](const LegalityQuery &Query) { 839 const LLT Ty = Query.Types[0]; 840 return Op == G_LOAD && !Ty.isVector() && 841 shouldWidenLoadResult(Query); 842 }, widenScalarOrEltToNextPow2(0)) 843 .narrowScalarIf( 844 [=](const LegalityQuery &Query) -> bool { 845 return !Query.Types[0].isVector() && 846 needToSplitMemOp(Query, Op == G_LOAD); 847 }, 848 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 849 const LLT DstTy = Query.Types[0]; 850 const LLT PtrTy = Query.Types[1]; 851 852 const unsigned DstSize = DstTy.getSizeInBits(); 853 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 854 855 // Split extloads. 856 if (DstSize > MemSize) 857 return std::make_pair(0, LLT::scalar(MemSize)); 858 859 if (!isPowerOf2_32(DstSize)) { 860 // We're probably decomposing an odd sized store. Try to split 861 // to the widest type. TODO: Account for alignment. As-is it 862 // should be OK, since the new parts will be further legalized. 863 unsigned FloorSize = PowerOf2Floor(DstSize); 864 return std::make_pair(0, LLT::scalar(FloorSize)); 865 } 866 867 if (DstSize > 32 && (DstSize % 32 != 0)) { 868 // FIXME: Need a way to specify non-extload of larger size if 869 // suitably aligned. 
870 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 871 } 872 873 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 874 Op == G_LOAD); 875 if (MemSize > MaxSize) 876 return std::make_pair(0, LLT::scalar(MaxSize)); 877 878 unsigned Align = Query.MMODescrs[0].AlignInBits; 879 return std::make_pair(0, LLT::scalar(Align)); 880 }) 881 .fewerElementsIf( 882 [=](const LegalityQuery &Query) -> bool { 883 return Query.Types[0].isVector() && 884 needToSplitMemOp(Query, Op == G_LOAD); 885 }, 886 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 887 const LLT DstTy = Query.Types[0]; 888 const LLT PtrTy = Query.Types[1]; 889 890 LLT EltTy = DstTy.getElementType(); 891 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 892 Op == G_LOAD); 893 894 // FIXME: Handle widened to power of 2 results better. This ends 895 // up scalarizing. 896 // FIXME: 3 element stores scalarized on SI 897 898 // Split if it's too large for the address space. 899 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 900 unsigned NumElts = DstTy.getNumElements(); 901 unsigned EltSize = EltTy.getSizeInBits(); 902 903 if (MaxSize % EltSize == 0) { 904 return std::make_pair( 905 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 906 } 907 908 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 909 910 // FIXME: Refine when odd breakdowns handled 911 // The scalars will need to be re-legalized. 912 if (NumPieces == 1 || NumPieces >= NumElts || 913 NumElts % NumPieces != 0) 914 return std::make_pair(0, EltTy); 915 916 return std::make_pair(0, 917 LLT::vector(NumElts / NumPieces, EltTy)); 918 } 919 920 // FIXME: We could probably handle weird extending loads better. 921 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 922 if (DstTy.getSizeInBits() > MemSize) 923 return std::make_pair(0, EltTy); 924 925 unsigned EltSize = EltTy.getSizeInBits(); 926 unsigned DstSize = DstTy.getSizeInBits(); 927 if (!isPowerOf2_32(DstSize)) { 928 // We're probably decomposing an odd sized store. Try to split 929 // to the widest type. TODO: Account for alignment. As-is it 930 // should be OK, since the new parts will be further legalized. 931 unsigned FloorSize = PowerOf2Floor(DstSize); 932 return std::make_pair( 933 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 934 } 935 936 // Need to split because of alignment. 937 unsigned Align = Query.MMODescrs[0].AlignInBits; 938 if (EltSize > Align && 939 (EltSize / Align < DstTy.getNumElements())) { 940 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 941 } 942 943 // May need relegalization for the scalars. 944 return std::make_pair(0, EltTy); 945 }) 946 .minScalar(0, S32); 947 948 if (IsStore) 949 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 950 951 // TODO: Need a bitcast lower option? 952 Actions 953 .legalIf([=](const LegalityQuery &Query) { 954 const LLT Ty0 = Query.Types[0]; 955 unsigned Size = Ty0.getSizeInBits(); 956 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 957 unsigned Align = Query.MMODescrs[0].AlignInBits; 958 959 // FIXME: Widening store from alignment not valid. 960 if (MemSize < Size) 961 MemSize = std::max(MemSize, Align); 962 963 // No extending vector loads. 
964 if (Size > MemSize && Ty0.isVector()) 965 return false; 966 967 switch (MemSize) { 968 case 8: 969 case 16: 970 return Size == 32; 971 case 32: 972 case 64: 973 case 128: 974 return true; 975 case 96: 976 return ST.hasDwordx3LoadStores(); 977 case 256: 978 case 512: 979 return true; 980 default: 981 return false; 982 } 983 }) 984 .widenScalarToNextPow2(0) 985 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 986 } 987 988 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 989 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 990 {S32, GlobalPtr, 16, 2 * 8}, 991 {S32, LocalPtr, 8, 8}, 992 {S32, LocalPtr, 16, 16}, 993 {S32, PrivatePtr, 8, 8}, 994 {S32, PrivatePtr, 16, 16}, 995 {S32, ConstantPtr, 8, 8}, 996 {S32, ConstantPtr, 16, 2 * 8}}); 997 if (ST.hasFlatAddressSpace()) { 998 ExtLoads.legalForTypesWithMemDesc( 999 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1000 } 1001 1002 ExtLoads.clampScalar(0, S32, S32) 1003 .widenScalarToNextPow2(0) 1004 .unsupportedIfMemSizeNotPow2() 1005 .lower(); 1006 1007 auto &Atomics = getActionDefinitionsBuilder( 1008 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1009 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1010 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1011 G_ATOMICRMW_UMIN}) 1012 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1013 {S64, GlobalPtr}, {S64, LocalPtr}}); 1014 if (ST.hasFlatAddressSpace()) { 1015 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1016 } 1017 1018 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1019 .legalFor({{S32, LocalPtr}}); 1020 1021 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1022 // demarshalling 1023 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1024 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1025 {S32, FlatPtr}, {S64, FlatPtr}}) 1026 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1027 {S32, RegionPtr}, {S64, RegionPtr}}); 1028 // TODO: Pointer types, any 32-bit or 64-bit vector 1029 1030 // Condition should be s32 for scalar, s1 for vector. 1031 getActionDefinitionsBuilder(G_SELECT) 1032 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1033 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1034 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1035 .clampScalar(0, S16, S64) 1036 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1037 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1038 .scalarize(1) 1039 .clampMaxNumElements(0, S32, 2) 1040 .clampMaxNumElements(0, LocalPtr, 2) 1041 .clampMaxNumElements(0, PrivatePtr, 2) 1042 .scalarize(0) 1043 .widenScalarToNextPow2(0) 1044 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1045 1046 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1047 // be more flexible with the shift amount type. 1048 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1049 .legalFor({{S32, S32}, {S64, S32}}); 1050 if (ST.has16BitInsts()) { 1051 if (ST.hasVOP3PInsts()) { 1052 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 1053 .clampMaxNumElements(0, S16, 2); 1054 } else 1055 Shifts.legalFor({{S16, S32}, {S16, S16}}); 1056 1057 // TODO: Support 16-bit shift amounts 1058 Shifts.clampScalar(1, S32, S32); 1059 Shifts.clampScalar(0, S16, S64); 1060 Shifts.widenScalarToNextPow2(0, 16); 1061 } else { 1062 // Make sure we legalize the shift amount type first, as the general 1063 // expansion for the shifted type will produce much worse code if it hasn't 1064 // been truncated already. 
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)

      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .lowerFor({{S16, V2S16}})
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case
TargetOpcode::G_FFLOOR: 1361 return legalizeFFloor(MI, MRI, B); 1362 case TargetOpcode::G_BUILD_VECTOR: 1363 return legalizeBuildVector(MI, MRI, B); 1364 default: 1365 return false; 1366 } 1367 1368 llvm_unreachable("expected switch to return"); 1369 } 1370 1371 Register AMDGPULegalizerInfo::getSegmentAperture( 1372 unsigned AS, 1373 MachineRegisterInfo &MRI, 1374 MachineIRBuilder &B) const { 1375 MachineFunction &MF = B.getMF(); 1376 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1377 const LLT S32 = LLT::scalar(32); 1378 1379 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1380 1381 if (ST.hasApertureRegs()) { 1382 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1383 // getreg. 1384 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1385 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1386 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1387 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1388 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1389 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1390 unsigned Encoding = 1391 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1392 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1393 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1394 1395 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1396 1397 B.buildInstr(AMDGPU::S_GETREG_B32) 1398 .addDef(GetReg) 1399 .addImm(Encoding); 1400 MRI.setType(GetReg, S32); 1401 1402 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1403 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1404 } 1405 1406 Register QueuePtr = MRI.createGenericVirtualRegister( 1407 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1408 1409 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1410 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1411 return Register(); 1412 1413 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1414 // private_segment_aperture_base_hi. 1415 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1416 1417 // TODO: can we be smarter about machine pointer info? 1418 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1419 MachineMemOperand *MMO = MF.getMachineMemOperand( 1420 PtrInfo, 1421 MachineMemOperand::MOLoad | 1422 MachineMemOperand::MODereferenceable | 1423 MachineMemOperand::MOInvariant, 1424 4, 1425 MinAlign(64, StructOffset)); 1426 1427 Register LoadAddr; 1428 1429 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1430 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1431 } 1432 1433 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1434 MachineInstr &MI, MachineRegisterInfo &MRI, 1435 MachineIRBuilder &B) const { 1436 MachineFunction &MF = B.getMF(); 1437 1438 B.setInstr(MI); 1439 1440 const LLT S32 = LLT::scalar(32); 1441 Register Dst = MI.getOperand(0).getReg(); 1442 Register Src = MI.getOperand(1).getReg(); 1443 1444 LLT DstTy = MRI.getType(Dst); 1445 LLT SrcTy = MRI.getType(Src); 1446 unsigned DestAS = DstTy.getAddressSpace(); 1447 unsigned SrcAS = SrcTy.getAddressSpace(); 1448 1449 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1450 // vector element. 
1451 assert(!DstTy.isVector()); 1452 1453 const AMDGPUTargetMachine &TM 1454 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1455 1456 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1457 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1458 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1459 return true; 1460 } 1461 1462 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1463 // Truncate. 1464 B.buildExtract(Dst, Src, 0); 1465 MI.eraseFromParent(); 1466 return true; 1467 } 1468 1469 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1470 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1471 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1472 1473 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1474 // another. Merge operands are required to be the same type, but creating an 1475 // extra ptrtoint would be kind of pointless. 1476 auto HighAddr = B.buildConstant( 1477 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1478 B.buildMerge(Dst, {Src, HighAddr}); 1479 MI.eraseFromParent(); 1480 return true; 1481 } 1482 1483 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1484 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1485 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1486 unsigned NullVal = TM.getNullPointerValue(DestAS); 1487 1488 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1489 auto FlatNull = B.buildConstant(SrcTy, 0); 1490 1491 // Extract low 32-bits of the pointer. 1492 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1493 1494 auto CmpRes = 1495 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1496 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1497 1498 MI.eraseFromParent(); 1499 return true; 1500 } 1501 1502 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1503 return false; 1504 1505 if (!ST.hasFlatAddressSpace()) 1506 return false; 1507 1508 auto SegmentNull = 1509 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1510 auto FlatNull = 1511 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1512 1513 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1514 if (!ApertureReg.isValid()) 1515 return false; 1516 1517 auto CmpRes = 1518 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1519 1520 // Coerce the type of the low half of the result so we can use merge_values. 1521 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1522 1523 // TODO: Should we allow mismatched types but matching sizes in merges to 1524 // avoid the ptrtoint? 1525 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1526 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1527 1528 MI.eraseFromParent(); 1529 return true; 1530 } 1531 1532 bool AMDGPULegalizerInfo::legalizeFrint( 1533 MachineInstr &MI, MachineRegisterInfo &MRI, 1534 MachineIRBuilder &B) const { 1535 B.setInstr(MI); 1536 1537 Register Src = MI.getOperand(1).getReg(); 1538 LLT Ty = MRI.getType(Src); 1539 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1540 1541 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1542 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1543 1544 auto C1 = B.buildFConstant(Ty, C1Val); 1545 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1546 1547 // TODO: Should this propagate fast-math-flags? 
1548 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1549 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1550 1551 auto C2 = B.buildFConstant(Ty, C2Val); 1552 auto Fabs = B.buildFAbs(Ty, Src); 1553 1554 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1555 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1556 return true; 1557 } 1558 1559 bool AMDGPULegalizerInfo::legalizeFceil( 1560 MachineInstr &MI, MachineRegisterInfo &MRI, 1561 MachineIRBuilder &B) const { 1562 B.setInstr(MI); 1563 1564 const LLT S1 = LLT::scalar(1); 1565 const LLT S64 = LLT::scalar(64); 1566 1567 Register Src = MI.getOperand(1).getReg(); 1568 assert(MRI.getType(Src) == S64); 1569 1570 // result = trunc(src) 1571 // if (src > 0.0 && src != result) 1572 // result += 1.0 1573 1574 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1575 1576 const auto Zero = B.buildFConstant(S64, 0.0); 1577 const auto One = B.buildFConstant(S64, 1.0); 1578 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1579 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1580 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1581 auto Add = B.buildSelect(S64, And, One, Zero); 1582 1583 // TODO: Should this propagate fast-math-flags? 1584 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1585 return true; 1586 } 1587 1588 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1589 MachineIRBuilder &B) { 1590 const unsigned FractBits = 52; 1591 const unsigned ExpBits = 11; 1592 LLT S32 = LLT::scalar(32); 1593 1594 auto Const0 = B.buildConstant(S32, FractBits - 32); 1595 auto Const1 = B.buildConstant(S32, ExpBits); 1596 1597 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1598 .addUse(Const0.getReg(0)) 1599 .addUse(Const1.getReg(0)); 1600 1601 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1602 } 1603 1604 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1605 MachineInstr &MI, MachineRegisterInfo &MRI, 1606 MachineIRBuilder &B) const { 1607 B.setInstr(MI); 1608 1609 const LLT S1 = LLT::scalar(1); 1610 const LLT S32 = LLT::scalar(32); 1611 const LLT S64 = LLT::scalar(64); 1612 1613 Register Src = MI.getOperand(1).getReg(); 1614 assert(MRI.getType(Src) == S64); 1615 1616 // TODO: Should this use extract since the low half is unused? 1617 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1618 Register Hi = Unmerge.getReg(1); 1619 1620 // Extract the upper half, since this is where we will find the sign and 1621 // exponent. 1622 auto Exp = extractF64Exponent(Hi, B); 1623 1624 const unsigned FractBits = 52; 1625 1626 // Extract the sign bit. 1627 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1628 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1629 1630 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1631 1632 const auto Zero32 = B.buildConstant(S32, 0); 1633 1634 // Extend back to 64-bits. 
1635 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1636 1637 auto Shr = B.buildAShr(S64, FractMask, Exp); 1638 auto Not = B.buildNot(S64, Shr); 1639 auto Tmp0 = B.buildAnd(S64, Src, Not); 1640 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1641 1642 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1643 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1644 1645 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1646 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1647 return true; 1648 } 1649 1650 bool AMDGPULegalizerInfo::legalizeITOFP( 1651 MachineInstr &MI, MachineRegisterInfo &MRI, 1652 MachineIRBuilder &B, bool Signed) const { 1653 B.setInstr(MI); 1654 1655 Register Dst = MI.getOperand(0).getReg(); 1656 Register Src = MI.getOperand(1).getReg(); 1657 1658 const LLT S64 = LLT::scalar(64); 1659 const LLT S32 = LLT::scalar(32); 1660 1661 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1662 1663 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1664 1665 auto CvtHi = Signed ? 1666 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1667 B.buildUITOFP(S64, Unmerge.getReg(1)); 1668 1669 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1670 1671 auto ThirtyTwo = B.buildConstant(S32, 32); 1672 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1673 .addUse(CvtHi.getReg(0)) 1674 .addUse(ThirtyTwo.getReg(0)); 1675 1676 // TODO: Should this propagate fast-math-flags? 1677 B.buildFAdd(Dst, LdExp, CvtLo); 1678 MI.eraseFromParent(); 1679 return true; 1680 } 1681 1682 // TODO: Copied from DAG implementation. Verify logic and document how this 1683 // actually works. 1684 bool AMDGPULegalizerInfo::legalizeFPTOI( 1685 MachineInstr &MI, MachineRegisterInfo &MRI, 1686 MachineIRBuilder &B, bool Signed) const { 1687 B.setInstr(MI); 1688 1689 Register Dst = MI.getOperand(0).getReg(); 1690 Register Src = MI.getOperand(1).getReg(); 1691 1692 const LLT S64 = LLT::scalar(64); 1693 const LLT S32 = LLT::scalar(32); 1694 1695 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1696 1697 unsigned Flags = MI.getFlags(); 1698 1699 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1700 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1701 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1702 1703 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1704 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1705 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1706 1707 auto Hi = Signed ? 
1708 B.buildFPTOSI(S32, FloorMul) : 1709 B.buildFPTOUI(S32, FloorMul); 1710 auto Lo = B.buildFPTOUI(S32, Fma); 1711 1712 B.buildMerge(Dst, { Lo, Hi }); 1713 MI.eraseFromParent(); 1714 1715 return true; 1716 } 1717 1718 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1719 MachineInstr &MI, MachineRegisterInfo &MRI, 1720 MachineIRBuilder &B) const { 1721 MachineFunction &MF = B.getMF(); 1722 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1723 1724 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1725 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1726 1727 // With ieee_mode disabled, the instructions have the correct behavior 1728 // already for G_FMINNUM/G_FMAXNUM 1729 if (!MFI->getMode().IEEE) 1730 return !IsIEEEOp; 1731 1732 if (IsIEEEOp) 1733 return true; 1734 1735 MachineIRBuilder HelperBuilder(MI); 1736 GISelObserverWrapper DummyObserver; 1737 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1738 HelperBuilder.setInstr(MI); 1739 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1740 } 1741 1742 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1743 MachineInstr &MI, MachineRegisterInfo &MRI, 1744 MachineIRBuilder &B) const { 1745 // TODO: Should move some of this into LegalizerHelper. 1746 1747 // TODO: Promote dynamic indexing of s16 to s32 1748 1749 // FIXME: Artifact combiner probably should have replaced the truncated 1750 // constant before this, so we shouldn't need 1751 // getConstantVRegValWithLookThrough. 1752 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1753 MI.getOperand(2).getReg(), MRI); 1754 if (!IdxVal) // Dynamic case will be selected to register indexing. 1755 return true; 1756 1757 Register Dst = MI.getOperand(0).getReg(); 1758 Register Vec = MI.getOperand(1).getReg(); 1759 1760 LLT VecTy = MRI.getType(Vec); 1761 LLT EltTy = VecTy.getElementType(); 1762 assert(EltTy == MRI.getType(Dst)); 1763 1764 B.setInstr(MI); 1765 1766 if (IdxVal->Value < VecTy.getNumElements()) 1767 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1768 else 1769 B.buildUndef(Dst); 1770 1771 MI.eraseFromParent(); 1772 return true; 1773 } 1774 1775 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1776 MachineInstr &MI, MachineRegisterInfo &MRI, 1777 MachineIRBuilder &B) const { 1778 // TODO: Should move some of this into LegalizerHelper. 1779 1780 // TODO: Promote dynamic indexing of s16 to s32 1781 1782 // FIXME: Artifact combiner probably should have replaced the truncated 1783 // constant before this, so we shouldn't need 1784 // getConstantVRegValWithLookThrough. 1785 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1786 MI.getOperand(3).getReg(), MRI); 1787 if (!IdxVal) // Dynamic case will be selected to register indexing. 1788 return true; 1789 1790 Register Dst = MI.getOperand(0).getReg(); 1791 Register Vec = MI.getOperand(1).getReg(); 1792 Register Ins = MI.getOperand(2).getReg(); 1793 1794 LLT VecTy = MRI.getType(Vec); 1795 LLT EltTy = VecTy.getElementType(); 1796 assert(EltTy == MRI.getType(Ins)); 1797 1798 B.setInstr(MI); 1799 1800 if (IdxVal->Value < VecTy.getNumElements()) 1801 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1802 else 1803 B.buildUndef(Dst); 1804 1805 MI.eraseFromParent(); 1806 return true; 1807 } 1808 1809 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) { 1810 assert(Mask.size() == 2); 1811 1812 // If one half is undef, the other is trivially in the same reg. 
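// For example (hand-checked cases): <1,0>, <0,0> and <2,3> each read from a
// single v2s16 source and are accepted, a mask with an undef half such as
// <u,1> is accepted by the check below, while <0,2> mixes both sources and
// falls back to the generic shuffle lowering.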
1813 if (Mask[0] == -1 || Mask[1] == -1) 1814 return true; 1815 return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) || 1816 ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3)); 1817 } 1818 1819 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1820 MachineInstr &MI, MachineRegisterInfo &MRI, 1821 MachineIRBuilder &B) const { 1822 const LLT V2S16 = LLT::vector(2, 16); 1823 1824 Register Dst = MI.getOperand(0).getReg(); 1825 Register Src0 = MI.getOperand(1).getReg(); 1826 LLT DstTy = MRI.getType(Dst); 1827 LLT SrcTy = MRI.getType(Src0); 1828 1829 if (SrcTy == V2S16 && DstTy == V2S16 && 1830 isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1831 return true; 1832 1833 MachineIRBuilder HelperBuilder(MI); 1834 GISelObserverWrapper DummyObserver; 1835 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1836 HelperBuilder.setInstr(MI); 1837 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1838 } 1839 1840 bool AMDGPULegalizerInfo::legalizeSinCos( 1841 MachineInstr &MI, MachineRegisterInfo &MRI, 1842 MachineIRBuilder &B) const { 1843 B.setInstr(MI); 1844 1845 Register DstReg = MI.getOperand(0).getReg(); 1846 Register SrcReg = MI.getOperand(1).getReg(); 1847 LLT Ty = MRI.getType(DstReg); 1848 unsigned Flags = MI.getFlags(); 1849 1850 Register TrigVal; 1851 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1852 if (ST.hasTrigReducedRange()) { 1853 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1854 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1855 .addUse(MulVal.getReg(0)) 1856 .setMIFlags(Flags).getReg(0); 1857 } else 1858 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1859 1860 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1861 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1862 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1863 .addUse(TrigVal) 1864 .setMIFlags(Flags); 1865 MI.eraseFromParent(); 1866 return true; 1867 } 1868 1869 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1870 Register DstReg, LLT PtrTy, 1871 MachineIRBuilder &B, const GlobalValue *GV, 1872 unsigned Offset, unsigned GAFlags) const { 1873 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1874 // to the following code sequence: 1875 // 1876 // For constant address space: 1877 // s_getpc_b64 s[0:1] 1878 // s_add_u32 s0, s0, $symbol 1879 // s_addc_u32 s1, s1, 0 1880 // 1881 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1882 // a fixup or relocation is emitted to replace $symbol with a literal 1883 // constant, which is a pc-relative offset from the encoding of the $symbol 1884 // operand to the global variable. 1885 // 1886 // For global address space: 1887 // s_getpc_b64 s[0:1] 1888 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1889 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1890 // 1891 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1892 // fixups or relocations are emitted to replace $symbol@*@lo and 1893 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1894 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1895 // operand to the global variable. 
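//
// Which of these sequences is emitted depends on how the target classifies
// the global: shouldEmitFixup() takes the bare pc-relative fixup path,
// shouldEmitPCReloc() uses the REL32 relocations, and everything else is
// reached through a GOT load (see legalizeGlobalValue below).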
1896 // 1897 // What we want here is an offset from the value returned by s_getpc 1898 // (which is the address of the s_add_u32 instruction) to the global 1899 // variable, but since the encoding of $symbol starts 4 bytes after the start 1900 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1901 // small. This requires us to add 4 to the global variable offset in order to 1902 // compute the correct address. 1903 1904 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1905 1906 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1907 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1908 1909 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1910 .addDef(PCReg); 1911 1912 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1913 if (GAFlags == SIInstrInfo::MO_NONE) 1914 MIB.addImm(0); 1915 else 1916 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1917 1918 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1919 1920 if (PtrTy.getSizeInBits() == 32) 1921 B.buildExtract(DstReg, PCReg, 0); 1922 return true; 1923 } 1924 1925 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1926 MachineInstr &MI, MachineRegisterInfo &MRI, 1927 MachineIRBuilder &B) const { 1928 Register DstReg = MI.getOperand(0).getReg(); 1929 LLT Ty = MRI.getType(DstReg); 1930 unsigned AS = Ty.getAddressSpace(); 1931 1932 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1933 MachineFunction &MF = B.getMF(); 1934 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1935 B.setInstr(MI); 1936 1937 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1938 if (!MFI->isEntryFunction()) { 1939 const Function &Fn = MF.getFunction(); 1940 DiagnosticInfoUnsupported BadLDSDecl( 1941 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1942 Fn.getContext().diagnose(BadLDSDecl); 1943 } 1944 1945 // TODO: We could emit code to handle the initialization somewhere. 1946 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1947 const SITargetLowering *TLI = ST.getTargetLowering(); 1948 if (!TLI->shouldUseLDSConstAddress(GV)) { 1949 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 1950 return true; // Leave in place; 1951 } 1952 1953 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 1954 MI.eraseFromParent(); 1955 return true; 1956 } 1957 1958 const Function &Fn = MF.getFunction(); 1959 DiagnosticInfoUnsupported BadInit( 1960 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 1961 Fn.getContext().diagnose(BadInit); 1962 return true; 1963 } 1964 1965 const SITargetLowering *TLI = ST.getTargetLowering(); 1966 1967 if (TLI->shouldEmitFixup(GV)) { 1968 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 1969 MI.eraseFromParent(); 1970 return true; 1971 } 1972 1973 if (TLI->shouldEmitPCReloc(GV)) { 1974 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 1975 MI.eraseFromParent(); 1976 return true; 1977 } 1978 1979 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1980 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 1981 1982 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 1983 MachinePointerInfo::getGOT(MF), 1984 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1985 MachineMemOperand::MOInvariant, 1986 8 /*Size*/, 8 /*Align*/); 1987 1988 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 1989 1990 if (Ty.getSizeInBits() == 32) { 1991 // Truncate if this is a 32-bit constant adrdess. 
1992 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 1993 B.buildExtract(DstReg, Load, 0); 1994 } else 1995 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 1996 1997 MI.eraseFromParent(); 1998 return true; 1999 } 2000 2001 bool AMDGPULegalizerInfo::legalizeLoad( 2002 MachineInstr &MI, MachineRegisterInfo &MRI, 2003 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2004 B.setInstr(MI); 2005 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2006 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2007 Observer.changingInstr(MI); 2008 MI.getOperand(1).setReg(Cast.getReg(0)); 2009 Observer.changedInstr(MI); 2010 return true; 2011 } 2012 2013 bool AMDGPULegalizerInfo::legalizeFMad( 2014 MachineInstr &MI, MachineRegisterInfo &MRI, 2015 MachineIRBuilder &B) const { 2016 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2017 assert(Ty.isScalar()); 2018 2019 MachineFunction &MF = B.getMF(); 2020 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2021 2022 // TODO: Always legal with future ftz flag. 2023 // FIXME: Do we need just output? 2024 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2025 return true; 2026 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2027 return true; 2028 2029 MachineIRBuilder HelperBuilder(MI); 2030 GISelObserverWrapper DummyObserver; 2031 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2032 HelperBuilder.setMBB(*MI.getParent()); 2033 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2034 } 2035 2036 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2037 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2038 Register DstReg = MI.getOperand(0).getReg(); 2039 Register PtrReg = MI.getOperand(1).getReg(); 2040 Register CmpVal = MI.getOperand(2).getReg(); 2041 Register NewVal = MI.getOperand(3).getReg(); 2042 2043 assert(SITargetLowering::isFlatGlobalAddrSpace( 2044 MRI.getType(PtrReg).getAddressSpace()) && 2045 "this should not have been custom lowered"); 2046 2047 LLT ValTy = MRI.getType(CmpVal); 2048 LLT VecTy = LLT::vector(2, ValTy); 2049 2050 B.setInstr(MI); 2051 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2052 2053 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2054 .addDef(DstReg) 2055 .addUse(PtrReg) 2056 .addUse(PackedVal) 2057 .setMemRefs(MI.memoperands()); 2058 2059 MI.eraseFromParent(); 2060 return true; 2061 } 2062 2063 bool AMDGPULegalizerInfo::legalizeFlog( 2064 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2065 Register Dst = MI.getOperand(0).getReg(); 2066 Register Src = MI.getOperand(1).getReg(); 2067 LLT Ty = B.getMRI()->getType(Dst); 2068 unsigned Flags = MI.getFlags(); 2069 B.setInstr(MI); 2070 2071 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2072 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2073 2074 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2075 MI.eraseFromParent(); 2076 return true; 2077 } 2078 2079 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2080 MachineIRBuilder &B) const { 2081 Register Dst = MI.getOperand(0).getReg(); 2082 Register Src = MI.getOperand(1).getReg(); 2083 unsigned Flags = MI.getFlags(); 2084 LLT Ty = B.getMRI()->getType(Dst); 2085 B.setInstr(MI); 2086 2087 auto K = B.buildFConstant(Ty, numbers::log2e); 2088 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2089 B.buildFExp2(Dst, Mul, Flags); 2090 MI.eraseFromParent(); 2091 return true; 2092 } 2093 2094 // Find a source register, ignoring 
any possible source modifiers. 2095 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2096 Register ModSrc = OrigSrc; 2097 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2098 ModSrc = SrcFNeg->getOperand(1).getReg(); 2099 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2100 ModSrc = SrcFAbs->getOperand(1).getReg(); 2101 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2102 ModSrc = SrcFAbs->getOperand(1).getReg(); 2103 return ModSrc; 2104 } 2105 2106 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2107 MachineRegisterInfo &MRI, 2108 MachineIRBuilder &B) const { 2109 B.setInstr(MI); 2110 2111 const LLT S1 = LLT::scalar(1); 2112 const LLT S64 = LLT::scalar(64); 2113 Register Dst = MI.getOperand(0).getReg(); 2114 Register OrigSrc = MI.getOperand(1).getReg(); 2115 unsigned Flags = MI.getFlags(); 2116 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2117 "this should not have been custom lowered"); 2118 2119 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2120 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2121 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2122 // V_FRACT bug is: 2123 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2124 // 2125 // Convert floor(x) to (x - fract(x)) 2126 2127 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2128 .addUse(OrigSrc) 2129 .setMIFlags(Flags); 2130 2131 // Give source modifier matching some assistance before obscuring a foldable 2132 // pattern. 2133 2134 // TODO: We can avoid the neg on the fract? The input sign to fract 2135 // shouldn't matter? 2136 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2137 2138 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2139 2140 Register Min = MRI.createGenericVirtualRegister(S64); 2141 2142 // We don't need to concern ourselves with the snan handling difference, so 2143 // use the one which will directly select. 2144 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2145 if (MFI->getMode().IEEE) 2146 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2147 else 2148 B.buildFMinNum(Min, Fract, Const, Flags); 2149 2150 Register CorrectedFract = Min; 2151 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2152 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2153 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2154 } 2155 2156 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2157 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2158 2159 MI.eraseFromParent(); 2160 return true; 2161 } 2162 2163 // Turn an illegal packed v2s16 build vector into bit operations. 2164 // TODO: This should probably be a bitcast action in LegalizerHelper. 
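// For example, a G_BUILD_VECTOR of two s16 values becomes, roughly:
//   %merge:_(s32) = G_MERGE_VALUES %lo:_(s16), %hi:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %merge:_(s32)
// relying on the packed, little-endian layout of the two halves.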
2165 bool AMDGPULegalizerInfo::legalizeBuildVector( 2166 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2167 Register Dst = MI.getOperand(0).getReg(); 2168 LLT DstTy = MRI.getType(Dst); 2169 const LLT S32 = LLT::scalar(32); 2170 const LLT V2S16 = LLT::vector(2, 16); 2171 (void)DstTy; 2172 (void)V2S16; 2173 assert(DstTy == V2S16); 2174 2175 Register Src0 = MI.getOperand(1).getReg(); 2176 Register Src1 = MI.getOperand(2).getReg(); 2177 assert(MRI.getType(Src0) == LLT::scalar(16)); 2178 2179 B.setInstr(MI); 2180 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2181 B.buildBitcast(Dst, Merge); 2182 2183 MI.eraseFromParent(); 2184 return true; 2185 } 2186 2187 // Return the use branch instruction, otherwise null if the usage is invalid. 2188 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2189 MachineRegisterInfo &MRI, 2190 MachineInstr *&Br) { 2191 Register CondDef = MI.getOperand(0).getReg(); 2192 if (!MRI.hasOneNonDBGUse(CondDef)) 2193 return nullptr; 2194 2195 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2196 if (UseMI.getParent() != MI.getParent() || 2197 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2198 return nullptr; 2199 2200 // Make sure the cond br is followed by a G_BR 2201 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2202 if (Next != MI.getParent()->end()) { 2203 if (Next->getOpcode() != AMDGPU::G_BR) 2204 return nullptr; 2205 Br = &*Next; 2206 } 2207 2208 return &UseMI; 2209 } 2210 2211 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, 2212 Register Reg, LLT Ty) const { 2213 Register LiveIn = MRI.getLiveInVirtReg(Reg); 2214 if (LiveIn) 2215 return LiveIn; 2216 2217 Register NewReg = MRI.createGenericVirtualRegister(Ty); 2218 MRI.addLiveIn(Reg, NewReg); 2219 return NewReg; 2220 } 2221 2222 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2223 const ArgDescriptor *Arg) const { 2224 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2225 return false; // TODO: Handle these 2226 2227 assert(Arg->getRegister().isPhysical()); 2228 2229 MachineRegisterInfo &MRI = *B.getMRI(); 2230 2231 LLT Ty = MRI.getType(DstReg); 2232 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); 2233 2234 if (Arg->isMasked()) { 2235 // TODO: Should we try to emit this once in the entry block? 2236 const LLT S32 = LLT::scalar(32); 2237 const unsigned Mask = Arg->getMask(); 2238 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2239 2240 Register AndMaskSrc = LiveIn; 2241 2242 if (Shift != 0) { 2243 auto ShiftAmt = B.buildConstant(S32, Shift); 2244 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2245 } 2246 2247 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2248 } else 2249 B.buildCopy(DstReg, LiveIn); 2250 2251 // Insert the argument copy if it doens't already exist. 2252 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
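// Sketch of the copy emitted below when the live-in has no def yet (the
// physical register is purely illustrative):
//   entry block:  %livein = COPY $preloaded_arg_reg
// with the builder's insertion point saved and restored around it.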
2253 if (!MRI.getVRegDef(LiveIn)) { 2254 // FIXME: Should have scoped insert pt 2255 MachineBasicBlock &OrigInsBB = B.getMBB(); 2256 auto OrigInsPt = B.getInsertPt(); 2257 2258 MachineBasicBlock &EntryMBB = B.getMF().front(); 2259 EntryMBB.addLiveIn(Arg->getRegister()); 2260 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2261 B.buildCopy(LiveIn, Arg->getRegister()); 2262 2263 B.setInsertPt(OrigInsBB, OrigInsPt); 2264 } 2265 2266 return true; 2267 } 2268 2269 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2270 MachineInstr &MI, 2271 MachineRegisterInfo &MRI, 2272 MachineIRBuilder &B, 2273 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2274 B.setInstr(MI); 2275 2276 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2277 2278 const ArgDescriptor *Arg; 2279 const TargetRegisterClass *RC; 2280 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2281 if (!Arg) { 2282 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2283 return false; 2284 } 2285 2286 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 2287 MI.eraseFromParent(); 2288 return true; 2289 } 2290 2291 return false; 2292 } 2293 2294 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2295 MachineRegisterInfo &MRI, 2296 MachineIRBuilder &B) const { 2297 B.setInstr(MI); 2298 Register Dst = MI.getOperand(0).getReg(); 2299 LLT DstTy = MRI.getType(Dst); 2300 LLT S16 = LLT::scalar(16); 2301 LLT S32 = LLT::scalar(32); 2302 LLT S64 = LLT::scalar(64); 2303 2304 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2305 return true; 2306 2307 if (DstTy == S16) 2308 return legalizeFDIV16(MI, MRI, B); 2309 if (DstTy == S32) 2310 return legalizeFDIV32(MI, MRI, B); 2311 if (DstTy == S64) 2312 return legalizeFDIV64(MI, MRI, B); 2313 2314 return false; 2315 } 2316 2317 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2318 MachineRegisterInfo &MRI, 2319 MachineIRBuilder &B) const { 2320 Register Res = MI.getOperand(0).getReg(); 2321 Register LHS = MI.getOperand(1).getReg(); 2322 Register RHS = MI.getOperand(2).getReg(); 2323 2324 uint16_t Flags = MI.getFlags(); 2325 2326 LLT ResTy = MRI.getType(Res); 2327 LLT S32 = LLT::scalar(32); 2328 LLT S64 = LLT::scalar(64); 2329 2330 const MachineFunction &MF = B.getMF(); 2331 bool Unsafe = 2332 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2333 2334 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2335 return false; 2336 2337 if (!Unsafe && ResTy == S32 && 2338 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2339 return false; 2340 2341 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2342 // 1 / x -> RCP(x) 2343 if (CLHS->isExactlyValue(1.0)) { 2344 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2345 .addUse(RHS) 2346 .setMIFlags(Flags); 2347 2348 MI.eraseFromParent(); 2349 return true; 2350 } 2351 2352 // -1 / x -> RCP( FNEG(x) ) 2353 if (CLHS->isExactlyValue(-1.0)) { 2354 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2355 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2356 .addUse(FNeg.getReg(0)) 2357 .setMIFlags(Flags); 2358 2359 MI.eraseFromParent(); 2360 return true; 2361 } 2362 } 2363 2364 // x / y -> x * (1.0 / y) 2365 if (Unsafe) { 2366 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2367 .addUse(RHS) 2368 .setMIFlags(Flags); 2369 B.buildFMul(Res, LHS, RCP, Flags); 2370 2371 MI.eraseFromParent(); 2372 return true; 2373 } 2374 2375 return false; 2376 } 2377 2378 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2379 MachineRegisterInfo &MRI, 2380 
MachineIRBuilder &B) const { 2381 B.setInstr(MI); 2382 Register Res = MI.getOperand(0).getReg(); 2383 Register LHS = MI.getOperand(1).getReg(); 2384 Register RHS = MI.getOperand(2).getReg(); 2385 2386 uint16_t Flags = MI.getFlags(); 2387 2388 LLT S16 = LLT::scalar(16); 2389 LLT S32 = LLT::scalar(32); 2390 2391 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2392 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2393 2394 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2395 .addUse(RHSExt.getReg(0)) 2396 .setMIFlags(Flags); 2397 2398 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2399 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2400 2401 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2402 .addUse(RDst.getReg(0)) 2403 .addUse(RHS) 2404 .addUse(LHS) 2405 .setMIFlags(Flags); 2406 2407 MI.eraseFromParent(); 2408 return true; 2409 } 2410 2411 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2412 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2413 static void toggleSPDenormMode(bool Enable, 2414 MachineIRBuilder &B, 2415 const GCNSubtarget &ST, 2416 AMDGPU::SIModeRegisterDefaults Mode) { 2417 // Set SP denorm mode to this value. 2418 unsigned SPDenormMode = 2419 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2420 2421 if (ST.hasDenormModeInst()) { 2422 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2423 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2424 2425 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2426 B.buildInstr(AMDGPU::S_DENORM_MODE) 2427 .addImm(NewDenormModeValue); 2428 2429 } else { 2430 // Select FP32 bit field in mode register. 2431 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2432 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2433 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2434 2435 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2436 .addImm(SPDenormMode) 2437 .addImm(SPDenormModeBitField); 2438 } 2439 } 2440 2441 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2442 MachineRegisterInfo &MRI, 2443 MachineIRBuilder &B) const { 2444 B.setInstr(MI); 2445 Register Res = MI.getOperand(0).getReg(); 2446 Register LHS = MI.getOperand(1).getReg(); 2447 Register RHS = MI.getOperand(2).getReg(); 2448 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2449 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2450 2451 uint16_t Flags = MI.getFlags(); 2452 2453 LLT S32 = LLT::scalar(32); 2454 LLT S1 = LLT::scalar(1); 2455 2456 auto One = B.buildFConstant(S32, 1.0f); 2457 2458 auto DenominatorScaled = 2459 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2460 .addUse(RHS) 2461 .addUse(LHS) 2462 .addImm(1) 2463 .setMIFlags(Flags); 2464 auto NumeratorScaled = 2465 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2466 .addUse(LHS) 2467 .addUse(RHS) 2468 .addImm(0) 2469 .setMIFlags(Flags); 2470 2471 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2472 .addUse(DenominatorScaled.getReg(0)) 2473 .setMIFlags(Flags); 2474 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2475 2476 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2477 // aren't modeled as reading it. 
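// The block below is the usual div_scale/rcp refinement sequence: Fma0 and
// Fma1 refine the reciprocal estimate, Mul forms the scaled quotient, and
// Fma2..Fma4 produce the corrected quotient plus residual that feed
// div_fmas. FP32 denormals are temporarily enabled around it (when the
// function's default mode flushes them) since the scaled intermediates can
// be denormal.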
2478 if (!Mode.allFP32Denormals())
2479 toggleSPDenormMode(true, B, ST, Mode);
2480
2481 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2482 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2483 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2484 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2485 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2486 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2487
2488 if (!Mode.allFP32Denormals())
2489 toggleSPDenormMode(false, B, ST, Mode);
2490
2491 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2492 .addUse(Fma4.getReg(0))
2493 .addUse(Fma1.getReg(0))
2494 .addUse(Fma3.getReg(0))
2495 .addUse(NumeratorScaled.getReg(1))
2496 .setMIFlags(Flags);
2497
2498 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2499 .addUse(Fmas.getReg(0))
2500 .addUse(RHS)
2501 .addUse(LHS)
2502 .setMIFlags(Flags);
2503
2504 MI.eraseFromParent();
2505 return true;
2506 }
2507
2508 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2509 MachineRegisterInfo &MRI,
2510 MachineIRBuilder &B) const {
2511 B.setInstr(MI);
2512 Register Res = MI.getOperand(0).getReg();
2513 Register LHS = MI.getOperand(1).getReg();
2514 Register RHS = MI.getOperand(2).getReg();
2515
2516 uint16_t Flags = MI.getFlags();
2517
2518 LLT S64 = LLT::scalar(64);
2519 LLT S1 = LLT::scalar(1);
2520
2521 auto One = B.buildFConstant(S64, 1.0);
2522
2523 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2524 .addUse(LHS)
2525 .addUse(RHS)
2526 .addImm(1)
2527 .setMIFlags(Flags);
2528
2529 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2530
2531 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2532 .addUse(DivScale0.getReg(0))
2533 .setMIFlags(Flags);
2534
2535 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2536 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2537 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2538
2539 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2540 .addUse(LHS)
2541 .addUse(RHS)
2542 .addImm(0)
2543 .setMIFlags(Flags);
2544
2545 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2546 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2547 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2548
2549 Register Scale;
2550 if (!ST.hasUsableDivScaleConditionOutput()) {
2551 // Workaround a hardware bug on SI where the condition output from div_scale
2552 // is not usable.
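// Sketch of the workaround: compare the high dwords of the sources with the
// high dwords of the two div_scale results to recover which operand each one
// scaled, then XOR the two compares to reconstruct the missing condition bit.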
2553 2554 LLT S32 = LLT::scalar(32); 2555 2556 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2557 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2558 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2559 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2560 2561 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2562 Scale1Unmerge.getReg(1)); 2563 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2564 Scale0Unmerge.getReg(1)); 2565 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2566 } else { 2567 Scale = DivScale1.getReg(1); 2568 } 2569 2570 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2571 .addUse(Fma4.getReg(0)) 2572 .addUse(Fma3.getReg(0)) 2573 .addUse(Mul.getReg(0)) 2574 .addUse(Scale) 2575 .setMIFlags(Flags); 2576 2577 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2578 .addUse(Fmas.getReg(0)) 2579 .addUse(RHS) 2580 .addUse(LHS) 2581 .setMIFlags(Flags); 2582 2583 MI.eraseFromParent(); 2584 return true; 2585 } 2586 2587 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2588 MachineRegisterInfo &MRI, 2589 MachineIRBuilder &B) const { 2590 B.setInstr(MI); 2591 Register Res = MI.getOperand(0).getReg(); 2592 Register LHS = MI.getOperand(2).getReg(); 2593 Register RHS = MI.getOperand(3).getReg(); 2594 uint16_t Flags = MI.getFlags(); 2595 2596 LLT S32 = LLT::scalar(32); 2597 LLT S1 = LLT::scalar(1); 2598 2599 auto Abs = B.buildFAbs(S32, RHS, Flags); 2600 const APFloat C0Val(1.0f); 2601 2602 auto C0 = B.buildConstant(S32, 0x6f800000); 2603 auto C1 = B.buildConstant(S32, 0x2f800000); 2604 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2605 2606 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2607 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2608 2609 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2610 2611 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2612 .addUse(Mul0.getReg(0)) 2613 .setMIFlags(Flags); 2614 2615 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2616 2617 B.buildFMul(Res, Sel, Mul1, Flags); 2618 2619 MI.eraseFromParent(); 2620 return true; 2621 } 2622 2623 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2624 MachineRegisterInfo &MRI, 2625 MachineIRBuilder &B) const { 2626 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2627 if (!MFI->isEntryFunction()) { 2628 return legalizePreloadedArgIntrin(MI, MRI, B, 2629 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2630 } 2631 2632 B.setInstr(MI); 2633 2634 uint64_t Offset = 2635 ST.getTargetLowering()->getImplicitParameterOffset( 2636 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2637 Register DstReg = MI.getOperand(0).getReg(); 2638 LLT DstTy = MRI.getType(DstReg); 2639 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2640 2641 const ArgDescriptor *Arg; 2642 const TargetRegisterClass *RC; 2643 std::tie(Arg, RC) 2644 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2645 if (!Arg) 2646 return false; 2647 2648 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2649 if (!loadInputValue(KernargPtrReg, B, Arg)) 2650 return false; 2651 2652 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2653 MI.eraseFromParent(); 2654 return true; 2655 } 2656 2657 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2658 MachineRegisterInfo &MRI, 2659 MachineIRBuilder &B, 2660 unsigned AddrSpace) const { 2661 B.setInstr(MI); 2662 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2663 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2664 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2665 MI.eraseFromParent(); 2666 return true; 2667 } 2668 2669 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2670 // offset (the offset that is included in bounds checking and swizzling, to be 2671 // split between the instruction's voffset and immoffset fields) and soffset 2672 // (the offset that is excluded from bounds checking and swizzling, to go in 2673 // the instruction's soffset field). This function takes the first kind of 2674 // offset and figures out how to split it between voffset and immoffset. 2675 std::tuple<Register, unsigned, unsigned> 2676 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2677 Register OrigOffset) const { 2678 const unsigned MaxImm = 4095; 2679 Register BaseReg; 2680 unsigned TotalConstOffset; 2681 MachineInstr *OffsetDef; 2682 const LLT S32 = LLT::scalar(32); 2683 2684 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2685 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2686 2687 unsigned ImmOffset = TotalConstOffset; 2688 2689 // If the immediate value is too big for the immoffset field, put the value 2690 // and -4096 into the immoffset field so that the value that is copied/added 2691 // for the voffset field is a multiple of 4096, and it stands more chance 2692 // of being CSEd with the copy/add for another similar load/store. 2693 // However, do not do that rounding down to a multiple of 4096 if that is a 2694 // negative number, as it appears to be illegal to have a negative offset 2695 // in the vgpr, even if adding the immediate offset makes it positive. 2696 unsigned Overflow = ImmOffset & ~MaxImm; 2697 ImmOffset -= Overflow; 2698 if ((int32_t)Overflow < 0) { 2699 Overflow += ImmOffset; 2700 ImmOffset = 0; 2701 } 2702 2703 if (Overflow != 0) { 2704 if (!BaseReg) { 2705 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2706 } else { 2707 auto OverflowVal = B.buildConstant(S32, Overflow); 2708 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2709 } 2710 } 2711 2712 if (!BaseReg) 2713 BaseReg = B.buildConstant(S32, 0).getReg(0); 2714 2715 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2716 } 2717 2718 /// Handle register layout difference for f16 images for some subtargets. 2719 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2720 MachineRegisterInfo &MRI, 2721 Register Reg) const { 2722 if (!ST.hasUnpackedD16VMem()) 2723 return Reg; 2724 2725 const LLT S16 = LLT::scalar(16); 2726 const LLT S32 = LLT::scalar(32); 2727 LLT StoreVT = MRI.getType(Reg); 2728 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2729 2730 auto Unmerge = B.buildUnmerge(S16, Reg); 2731 2732 SmallVector<Register, 4> WideRegs; 2733 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2734 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2735 2736 int NumElts = StoreVT.getNumElements(); 2737 2738 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2739 } 2740 2741 Register AMDGPULegalizerInfo::fixStoreSourceType( 2742 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2743 MachineRegisterInfo *MRI = B.getMRI(); 2744 LLT Ty = MRI->getType(VData); 2745 2746 const LLT S16 = LLT::scalar(16); 2747 2748 // Fixup illegal register types for i8 stores. 
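// For example, an s8 or s16 store value is widened with G_ANYEXT to s32
// here; the byte/short buffer store opcode chosen in legalizeBufferStore
// below only writes the low bits, so the extension is harmless.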
2749 if (Ty == LLT::scalar(8) || Ty == S16) { 2750 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2751 return AnyExt; 2752 } 2753 2754 if (Ty.isVector()) { 2755 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2756 if (IsFormat) 2757 return handleD16VData(B, *MRI, VData); 2758 } 2759 } 2760 2761 return VData; 2762 } 2763 2764 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2765 MachineRegisterInfo &MRI, 2766 MachineIRBuilder &B, 2767 bool IsTyped, 2768 bool IsFormat) const { 2769 B.setInstr(MI); 2770 2771 Register VData = MI.getOperand(1).getReg(); 2772 LLT Ty = MRI.getType(VData); 2773 LLT EltTy = Ty.getScalarType(); 2774 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2775 const LLT S32 = LLT::scalar(32); 2776 2777 VData = fixStoreSourceType(B, VData, IsFormat); 2778 Register RSrc = MI.getOperand(2).getReg(); 2779 2780 MachineMemOperand *MMO = *MI.memoperands_begin(); 2781 const int MemSize = MMO->getSize(); 2782 2783 unsigned ImmOffset; 2784 unsigned TotalOffset; 2785 2786 // The typed intrinsics add an immediate after the registers. 2787 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2788 2789 // The struct intrinsic variants add one additional operand over raw. 2790 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2791 Register VIndex; 2792 int OpOffset = 0; 2793 if (HasVIndex) { 2794 VIndex = MI.getOperand(3).getReg(); 2795 OpOffset = 1; 2796 } 2797 2798 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2799 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2800 2801 unsigned Format = 0; 2802 if (IsTyped) { 2803 Format = MI.getOperand(5 + OpOffset).getImm(); 2804 ++OpOffset; 2805 } 2806 2807 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2808 2809 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2810 if (TotalOffset != 0) 2811 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2812 2813 unsigned Opc; 2814 if (IsTyped) { 2815 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 2816 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 2817 } else if (IsFormat) { 2818 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 2819 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 2820 } else { 2821 switch (MemSize) { 2822 case 1: 2823 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 2824 break; 2825 case 2: 2826 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 2827 break; 2828 default: 2829 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 2830 break; 2831 } 2832 } 2833 2834 if (!VIndex) 2835 VIndex = B.buildConstant(S32, 0).getReg(0); 2836 2837 auto MIB = B.buildInstr(Opc) 2838 .addUse(VData) // vdata 2839 .addUse(RSrc) // rsrc 2840 .addUse(VIndex) // vindex 2841 .addUse(VOffset) // voffset 2842 .addUse(SOffset) // soffset 2843 .addImm(ImmOffset); // offset(imm) 2844 2845 if (IsTyped) 2846 MIB.addImm(Format); 2847 2848 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2849 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2850 .addMemOperand(MMO); 2851 2852 MI.eraseFromParent(); 2853 return true; 2854 } 2855 2856 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 2857 MachineRegisterInfo &MRI, 2858 MachineIRBuilder &B, 2859 bool IsFormat, 2860 bool IsTyped) const { 2861 B.setInstr(MI); 2862 2863 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
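// Operand layout handled here (after the intrinsic ID): rsrc, [vindex,]
// voffset, soffset, [format,] aux. The optional vindex marks the struct
// variants and format is only present for the typed (tbuffer) forms.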
2864 MachineMemOperand *MMO = *MI.memoperands_begin(); 2865 const int MemSize = MMO->getSize(); 2866 const LLT S32 = LLT::scalar(32); 2867 2868 Register Dst = MI.getOperand(0).getReg(); 2869 Register RSrc = MI.getOperand(2).getReg(); 2870 2871 // The typed intrinsics add an immediate after the registers. 2872 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2873 2874 // The struct intrinsic variants add one additional operand over raw. 2875 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2876 Register VIndex; 2877 int OpOffset = 0; 2878 if (HasVIndex) { 2879 VIndex = MI.getOperand(3).getReg(); 2880 OpOffset = 1; 2881 } 2882 2883 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2884 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2885 2886 unsigned Format = 0; 2887 if (IsTyped) { 2888 Format = MI.getOperand(5 + OpOffset).getImm(); 2889 ++OpOffset; 2890 } 2891 2892 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2893 unsigned ImmOffset; 2894 unsigned TotalOffset; 2895 2896 LLT Ty = MRI.getType(Dst); 2897 LLT EltTy = Ty.getScalarType(); 2898 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2899 const bool Unpacked = ST.hasUnpackedD16VMem(); 2900 2901 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2902 if (TotalOffset != 0) 2903 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2904 2905 unsigned Opc; 2906 2907 if (IsTyped) { 2908 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 2909 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 2910 } else if (IsFormat) { 2911 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 2912 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 2913 } else { 2914 switch (MemSize) { 2915 case 1: 2916 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 2917 break; 2918 case 2: 2919 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 2920 break; 2921 default: 2922 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 2923 break; 2924 } 2925 } 2926 2927 Register LoadDstReg; 2928 2929 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 2930 LLT UnpackedTy = Ty.changeElementSize(32); 2931 2932 if (IsExtLoad) 2933 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 2934 else if (Unpacked && IsD16 && Ty.isVector()) 2935 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 2936 else 2937 LoadDstReg = Dst; 2938 2939 if (!VIndex) 2940 VIndex = B.buildConstant(S32, 0).getReg(0); 2941 2942 auto MIB = B.buildInstr(Opc) 2943 .addDef(LoadDstReg) // vdata 2944 .addUse(RSrc) // rsrc 2945 .addUse(VIndex) // vindex 2946 .addUse(VOffset) // voffset 2947 .addUse(SOffset) // soffset 2948 .addImm(ImmOffset); // offset(imm) 2949 2950 if (IsTyped) 2951 MIB.addImm(Format); 2952 2953 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2954 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2955 .addMemOperand(MMO); 2956 2957 if (LoadDstReg != Dst) { 2958 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2959 2960 // Widen result for extending loads was widened. 
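// e.g. an s8/s16 access, or a scalar d16 load, was emitted into a wider s32
// register above, so a plain G_TRUNC back to the original type is all that
// is needed here.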
2961 if (IsExtLoad) 2962 B.buildTrunc(Dst, LoadDstReg); 2963 else { 2964 // Repack to original 16-bit vector result 2965 // FIXME: G_TRUNC should work, but legalization currently fails 2966 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 2967 SmallVector<Register, 4> Repack; 2968 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 2969 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 2970 B.buildMerge(Dst, Repack); 2971 } 2972 } 2973 2974 MI.eraseFromParent(); 2975 return true; 2976 } 2977 2978 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 2979 MachineIRBuilder &B, 2980 bool IsInc) const { 2981 B.setInstr(MI); 2982 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 2983 AMDGPU::G_AMDGPU_ATOMIC_DEC; 2984 B.buildInstr(Opc) 2985 .addDef(MI.getOperand(0).getReg()) 2986 .addUse(MI.getOperand(2).getReg()) 2987 .addUse(MI.getOperand(3).getReg()) 2988 .cloneMemRefs(MI); 2989 MI.eraseFromParent(); 2990 return true; 2991 } 2992 2993 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 2994 switch (IntrID) { 2995 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 2996 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 2997 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 2998 case Intrinsic::amdgcn_raw_buffer_atomic_add: 2999 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3000 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3001 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3002 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3003 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3004 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3005 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3006 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3007 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3008 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3009 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3010 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3011 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3012 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3013 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3014 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3015 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3016 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3017 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3018 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3019 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3020 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3021 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3022 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3023 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3024 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3025 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3026 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3027 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3028 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3029 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3030 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3031 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3032 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3033 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3034 default: 3035 llvm_unreachable("unhandled atomic opcode"); 3036 } 3037 } 3038 3039 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3040 MachineIRBuilder &B, 3041 Intrinsic::ID IID) const { 3042 B.setInstr(MI); 3043 3044 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3045 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3046 3047 Register Dst = MI.getOperand(0).getReg(); 3048 Register VData = 
MI.getOperand(2).getReg(); 3049 3050 Register CmpVal; 3051 int OpOffset = 0; 3052 3053 if (IsCmpSwap) { 3054 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3055 ++OpOffset; 3056 } 3057 3058 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3059 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3060 3061 // The struct intrinsic variants add one additional operand over raw. 3062 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3063 Register VIndex; 3064 if (HasVIndex) { 3065 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3066 ++OpOffset; 3067 } 3068 3069 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3070 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3071 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3072 3073 MachineMemOperand *MMO = *MI.memoperands_begin(); 3074 3075 unsigned ImmOffset; 3076 unsigned TotalOffset; 3077 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3078 if (TotalOffset != 0) 3079 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3080 3081 if (!VIndex) 3082 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3083 3084 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3085 .addDef(Dst) 3086 .addUse(VData); // vdata 3087 3088 if (IsCmpSwap) 3089 MIB.addReg(CmpVal); 3090 3091 MIB.addUse(RSrc) // rsrc 3092 .addUse(VIndex) // vindex 3093 .addUse(VOffset) // voffset 3094 .addUse(SOffset) // soffset 3095 .addImm(ImmOffset) // offset(imm) 3096 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3097 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3098 .addMemOperand(MMO); 3099 3100 MI.eraseFromParent(); 3101 return true; 3102 } 3103 3104 // Produce a vector of s16 elements from s32 pieces. 3105 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg, 3106 ArrayRef<Register> UnmergeParts) { 3107 const LLT S16 = LLT::scalar(16); 3108 3109 SmallVector<Register, 4> RemergeParts(UnmergeParts.size()); 3110 for (int I = 0, E = UnmergeParts.size(); I != E; ++I) 3111 RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0); 3112 3113 B.buildBuildVector(DstReg, RemergeParts); 3114 } 3115 3116 /// Convert a set of s32 registers to a result vector with s16 elements. 3117 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg, 3118 ArrayRef<Register> UnmergeParts) { 3119 MachineRegisterInfo &MRI = *B.getMRI(); 3120 const LLT V2S16 = LLT::vector(2, 16); 3121 LLT TargetTy = MRI.getType(DstReg); 3122 int NumElts = UnmergeParts.size(); 3123 3124 if (NumElts == 1) { 3125 assert(TargetTy == V2S16); 3126 B.buildBitcast(DstReg, UnmergeParts[0]); 3127 return; 3128 } 3129 3130 SmallVector<Register, 4> RemergeParts(NumElts); 3131 for (int I = 0; I != NumElts; ++I) 3132 RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0); 3133 3134 if (TargetTy.getSizeInBits() == 32u * NumElts) { 3135 B.buildConcatVectors(DstReg, RemergeParts); 3136 return; 3137 } 3138 3139 const LLT V3S16 = LLT::vector(3, 16); 3140 const LLT V6S16 = LLT::vector(6, 16); 3141 3142 // Widen to v6s16 and unpack v3 parts. 3143 assert(TargetTy == V3S16); 3144 3145 RemergeParts.push_back(B.buildUndef(V2S16).getReg(0)); 3146 auto Concat = B.buildConcatVectors(V6S16, RemergeParts); 3147 B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat); 3148 } 3149 3150 // FIXME: Just vector trunc should be sufficent, but legalization currently 3151 // broken. 
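// For example, repacking a <2 x s16> result that was loaded in unpacked form
// (one element per dword) looks roughly like:
//   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %wide:_(<2 x s32>)
//   %dst:_(<2 x s16>) = G_BUILD_VECTOR trunc(%lo), trunc(%hi)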
3152 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg, 3153 Register WideDstReg) { 3154 const LLT S32 = LLT::scalar(32); 3155 const LLT S16 = LLT::scalar(16); 3156 3157 auto Unmerge = B.buildUnmerge(S32, WideDstReg); 3158 3159 int NumOps = Unmerge->getNumOperands() - 1; 3160 SmallVector<Register, 4> RemergeParts(NumOps); 3161 for (int I = 0; I != NumOps; ++I) 3162 RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0); 3163 3164 B.buildBuildVector(DstReg, RemergeParts); 3165 } 3166 3167 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3168 MachineInstr &MI, MachineIRBuilder &B, 3169 GISelChangeObserver &Observer, 3170 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3171 bool IsTFE = MI.getNumExplicitDefs() == 2; 3172 3173 // We are only processing the operands of d16 image operations on subtargets 3174 // that use the unpacked register layout, or need to repack the TFE result. 3175 3176 // TODO: Need to handle a16 images too 3177 // TODO: Do we need to guard against already legalized intrinsics? 3178 if (!IsTFE && !ST.hasUnpackedD16VMem()) 3179 return true; 3180 3181 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3182 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3183 3184 if (BaseOpcode->Atomic) // No d16 atomics, or TFE. 3185 return true; 3186 3187 B.setInstr(MI); 3188 3189 MachineRegisterInfo *MRI = B.getMRI(); 3190 const LLT S32 = LLT::scalar(32); 3191 const LLT S16 = LLT::scalar(16); 3192 3193 if (BaseOpcode->Store) { // No TFE for stores? 3194 Register VData = MI.getOperand(1).getReg(); 3195 LLT Ty = MRI->getType(VData); 3196 if (!Ty.isVector() || Ty.getElementType() != S16) 3197 return true; 3198 3199 B.setInstr(MI); 3200 3201 Observer.changingInstr(MI); 3202 MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData)); 3203 Observer.changedInstr(MI); 3204 return true; 3205 } 3206 3207 Register DstReg = MI.getOperand(0).getReg(); 3208 LLT Ty = MRI->getType(DstReg); 3209 const LLT EltTy = Ty.getScalarType(); 3210 const bool IsD16 = Ty.getScalarType() == S16; 3211 const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3212 3213 if (IsTFE) { 3214 // In the IR, TFE is supposed to be used with a 2 element struct return 3215 // type. The intruction really returns these two values in one contiguous 3216 // register, with one additional dword beyond the loaded data. Rewrite the 3217 // return type to use a single register result. 3218 Register Dst1Reg = MI.getOperand(1).getReg(); 3219 if (MRI->getType(Dst1Reg) != S32) 3220 return false; 3221 3222 // TODO: Make sure the TFE operand bit is set. 3223 3224 // The raw dword aligned data component of the load. The only legal cases 3225 // where this matters should be when using the packed D16 format, for 3226 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3227 LLT RoundedTy; 3228 LLT TFETy; 3229 3230 if (IsD16 && ST.hasUnpackedD16VMem()) { 3231 RoundedTy = LLT::scalarOrVector(NumElts, 32); 3232 TFETy = LLT::vector(NumElts + 1, 32); 3233 } else { 3234 unsigned EltSize = Ty.getScalarSizeInBits(); 3235 unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32; 3236 unsigned RoundedSize = 32 * RoundedElts; 3237 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3238 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3239 } 3240 3241 Register TFEReg = MRI->createGenericVirtualRegister(TFETy); 3242 Observer.changingInstr(MI); 3243 3244 MI.getOperand(0).setReg(TFEReg); 3245 MI.RemoveOperand(1); 3246 3247 Observer.changedInstr(MI); 3248 3249 // Insert after the instruction. 
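// From this point MI defines a single register of type TFETy. For example
// (shapes are illustrative) a <2 x s16> d16 load with TFE on an unpacked
// subtarget now defines <3 x s32>: two data dwords plus the status dword,
// which are unmerged and repacked below.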
3250 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3251 3252 // Now figure out how to copy the new result register back into the old 3253 // result. 3254 3255 SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg); 3256 int NumDataElts = TFETy.getNumElements() - 1; 3257 3258 if (!Ty.isVector()) { 3259 // Simplest case is a trivial unmerge (plus a truncate for d16). 3260 UnmergeResults[0] = Ty == S32 ? 3261 DstReg : MRI->createGenericVirtualRegister(S32); 3262 3263 B.buildUnmerge(UnmergeResults, TFEReg); 3264 if (Ty != S32) 3265 B.buildTrunc(DstReg, UnmergeResults[0]); 3266 return true; 3267 } 3268 3269 // We have to repack into a new vector of some kind. 3270 for (int I = 0; I != NumDataElts; ++I) 3271 UnmergeResults[I] = MRI->createGenericVirtualRegister(S32); 3272 B.buildUnmerge(UnmergeResults, TFEReg); 3273 3274 // Drop the final TFE element. 3275 ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts); 3276 3277 if (EltTy == S32) 3278 B.buildBuildVector(DstReg, DataPart); 3279 else if (ST.hasUnpackedD16VMem()) 3280 truncToS16Vector(B, DstReg, DataPart); 3281 else 3282 bitcastToS16Vector(B, DstReg, DataPart); 3283 3284 return true; 3285 } 3286 3287 // Must be an image load. 3288 if (!Ty.isVector() || Ty.getElementType() != S16) 3289 return true; 3290 3291 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3292 3293 LLT WidenedTy = Ty.changeElementType(S32); 3294 Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy); 3295 3296 Observer.changingInstr(MI); 3297 MI.getOperand(0).setReg(WideDstReg); 3298 Observer.changedInstr(MI); 3299 3300 repackUnpackedD16Load(B, DstReg, WideDstReg); 3301 return true; 3302 } 3303 3304 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 3305 MachineInstr &MI, MachineIRBuilder &B, 3306 GISelChangeObserver &Observer) const { 3307 Register Dst = MI.getOperand(0).getReg(); 3308 LLT Ty = B.getMRI()->getType(Dst); 3309 unsigned Size = Ty.getSizeInBits(); 3310 MachineFunction &MF = B.getMF(); 3311 3312 Observer.changingInstr(MI); 3313 3314 // FIXME: We don't really need this intermediate instruction. The intrinsic 3315 // should be fixed to have a memory operand. Since it's readnone, we're not 3316 // allowed to add one. 3317 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 3318 MI.RemoveOperand(1); // Remove intrinsic ID 3319 3320 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 3321 // TODO: Should this use datalayout alignment? 3322 const unsigned MemSize = (Size + 7) / 8; 3323 const unsigned MemAlign = 4; 3324 MachineMemOperand *MMO = MF.getMachineMemOperand( 3325 MachinePointerInfo(), 3326 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 3327 MachineMemOperand::MOInvariant, MemSize, MemAlign); 3328 MI.addMemOperand(MF, MMO); 3329 3330 // There are no 96-bit result scalar loads, but widening to 128-bit should 3331 // always be legal. We may need to restore this to a 96-bit result if it turns 3332 // out this needs to be converted to a vector load during RegBankSelect. 
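// For example, under this scheme a <3 x s32> result is padded to <4 x s32>
// and an s96 scalar result is widened to s128.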
3333 if (!isPowerOf2_32(Size)) { 3334 LegalizerHelper Helper(MF, *this, Observer, B); 3335 B.setInstr(MI); 3336 3337 if (Ty.isVector()) 3338 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 3339 else 3340 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 3341 } 3342 3343 Observer.changedInstr(MI); 3344 return true; 3345 } 3346 3347 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 3348 MachineIRBuilder &B, 3349 GISelChangeObserver &Observer) const { 3350 MachineRegisterInfo &MRI = *B.getMRI(); 3351 3352 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 3353 auto IntrID = MI.getIntrinsicID(); 3354 switch (IntrID) { 3355 case Intrinsic::amdgcn_if: 3356 case Intrinsic::amdgcn_else: { 3357 MachineInstr *Br = nullptr; 3358 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3359 const SIRegisterInfo *TRI 3360 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3361 3362 B.setInstr(*BrCond); 3363 Register Def = MI.getOperand(1).getReg(); 3364 Register Use = MI.getOperand(3).getReg(); 3365 3366 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 3367 if (Br) 3368 BrTarget = Br->getOperand(0).getMBB(); 3369 3370 if (IntrID == Intrinsic::amdgcn_if) { 3371 B.buildInstr(AMDGPU::SI_IF) 3372 .addDef(Def) 3373 .addUse(Use) 3374 .addMBB(BrTarget); 3375 } else { 3376 B.buildInstr(AMDGPU::SI_ELSE) 3377 .addDef(Def) 3378 .addUse(Use) 3379 .addMBB(BrTarget) 3380 .addImm(0); 3381 } 3382 3383 if (Br) 3384 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 3385 3386 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 3387 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 3388 MI.eraseFromParent(); 3389 BrCond->eraseFromParent(); 3390 return true; 3391 } 3392 3393 return false; 3394 } 3395 case Intrinsic::amdgcn_loop: { 3396 MachineInstr *Br = nullptr; 3397 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3398 const SIRegisterInfo *TRI 3399 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3400 3401 B.setInstr(*BrCond); 3402 3403 // FIXME: Need to adjust branch targets based on unconditional branch. 
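// Roughly, the G_INTRINSIC(amdgcn.loop) + G_BRCOND pair matched above is
// replaced by a single
//   SI_LOOP %mask, %target_bb
// (the G_BRCOND destination), and the mask register is constrained to the
// wave mask register class.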
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
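  // All buffer atomic intrinsics share a single legalization path; the
  // intrinsic ID is passed along so the handler can tell the individual
  // atomic operations apart.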
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}