//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next
multiple of 32-bit 98 // type. 99 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 100 return [=](const LegalityQuery &Query) { 101 const LLT Ty = Query.Types[TypeIdx]; 102 103 const LLT EltTy = Ty.getElementType(); 104 const int Size = Ty.getSizeInBits(); 105 const int EltSize = EltTy.getSizeInBits(); 106 const int NextMul32 = (Size + 31) / 32; 107 108 assert(EltSize < 32); 109 110 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 111 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 112 }; 113 } 114 115 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 116 return [=](const LegalityQuery &Query) { 117 const LLT QueryTy = Query.Types[TypeIdx]; 118 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 119 }; 120 } 121 122 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 123 return [=](const LegalityQuery &Query) { 124 const LLT QueryTy = Query.Types[TypeIdx]; 125 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 126 }; 127 } 128 129 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 130 return [=](const LegalityQuery &Query) { 131 const LLT QueryTy = Query.Types[TypeIdx]; 132 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 133 }; 134 } 135 136 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 137 // v2s16. 138 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 139 return [=](const LegalityQuery &Query) { 140 const LLT Ty = Query.Types[TypeIdx]; 141 if (Ty.isVector()) { 142 const int EltSize = Ty.getElementType().getSizeInBits(); 143 return EltSize == 32 || EltSize == 64 || 144 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 145 EltSize == 128 || EltSize == 256; 146 } 147 148 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 149 }; 150 } 151 152 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 153 return [=](const LegalityQuery &Query) { 154 return Query.Types[TypeIdx].getElementType() == Type; 155 }; 156 } 157 158 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 159 return [=](const LegalityQuery &Query) { 160 const LLT Ty = Query.Types[TypeIdx]; 161 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 162 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 163 }; 164 } 165 166 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 167 const GCNTargetMachine &TM) 168 : ST(ST_) { 169 using namespace TargetOpcode; 170 171 auto GetAddrSpacePtr = [&TM](unsigned AS) { 172 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 173 }; 174 175 const LLT S1 = LLT::scalar(1); 176 const LLT S8 = LLT::scalar(8); 177 const LLT S16 = LLT::scalar(16); 178 const LLT S32 = LLT::scalar(32); 179 const LLT S64 = LLT::scalar(64); 180 const LLT S96 = LLT::scalar(96); 181 const LLT S128 = LLT::scalar(128); 182 const LLT S256 = LLT::scalar(256); 183 const LLT S1024 = LLT::scalar(1024); 184 185 const LLT V2S16 = LLT::vector(2, 16); 186 const LLT V4S16 = LLT::vector(4, 16); 187 188 const LLT V2S32 = LLT::vector(2, 32); 189 const LLT V3S32 = LLT::vector(3, 32); 190 const LLT V4S32 = LLT::vector(4, 32); 191 const LLT V5S32 = LLT::vector(5, 32); 192 const LLT V6S32 = LLT::vector(6, 32); 193 const LLT V7S32 = LLT::vector(7, 32); 194 const LLT V8S32 = LLT::vector(8, 32); 195 const LLT V9S32 = LLT::vector(9, 32); 196 const LLT V10S32 = LLT::vector(10, 32); 197 const LLT V11S32 = LLT::vector(11, 32); 198 const LLT V12S32 = LLT::vector(12, 32); 199 const LLT V13S32 = 
LLT::vector(13, 32); 200 const LLT V14S32 = LLT::vector(14, 32); 201 const LLT V15S32 = LLT::vector(15, 32); 202 const LLT V16S32 = LLT::vector(16, 32); 203 const LLT V32S32 = LLT::vector(32, 32); 204 205 const LLT V2S64 = LLT::vector(2, 64); 206 const LLT V3S64 = LLT::vector(3, 64); 207 const LLT V4S64 = LLT::vector(4, 64); 208 const LLT V5S64 = LLT::vector(5, 64); 209 const LLT V6S64 = LLT::vector(6, 64); 210 const LLT V7S64 = LLT::vector(7, 64); 211 const LLT V8S64 = LLT::vector(8, 64); 212 const LLT V16S64 = LLT::vector(16, 64); 213 214 std::initializer_list<LLT> AllS32Vectors = 215 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 216 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 217 std::initializer_list<LLT> AllS64Vectors = 218 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 219 220 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 221 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 222 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 223 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 224 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 225 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 226 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 227 228 const LLT CodePtr = FlatPtr; 229 230 const std::initializer_list<LLT> AddrSpaces64 = { 231 GlobalPtr, ConstantPtr, FlatPtr 232 }; 233 234 const std::initializer_list<LLT> AddrSpaces32 = { 235 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 236 }; 237 238 const std::initializer_list<LLT> FPTypesBase = { 239 S32, S64 240 }; 241 242 const std::initializer_list<LLT> FPTypes16 = { 243 S32, S64, S16 244 }; 245 246 const std::initializer_list<LLT> FPTypesPK16 = { 247 S32, S64, S16, V2S16 248 }; 249 250 const LLT MinLegalScalarShiftTy = ST.has16BitInsts() ? S16 : S32; 251 252 setAction({G_BRCOND, S1}, Legal); // VCC branches 253 setAction({G_BRCOND, S32}, Legal); // SCC branches 254 255 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 256 // elements for v3s16 257 getActionDefinitionsBuilder(G_PHI) 258 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 259 .legalFor(AllS32Vectors) 260 .legalFor(AllS64Vectors) 261 .legalFor(AddrSpaces64) 262 .legalFor(AddrSpaces32) 263 .clampScalar(0, S32, S256) 264 .widenScalarToNextPow2(0, 32) 265 .clampMaxNumElements(0, S32, 16) 266 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 267 .legalIf(isPointer(0)); 268 269 if (ST.has16BitInsts()) { 270 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 271 .legalFor({S32, S16}) 272 .clampScalar(0, S16, S32) 273 .scalarize(0); 274 } else { 275 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 276 .legalFor({S32}) 277 .clampScalar(0, S32, S32) 278 .scalarize(0); 279 } 280 281 // FIXME: Not really legal. Placeholder for custom lowering. 282 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 283 .legalFor({S32, S64}) 284 .clampScalar(0, S32, S64) 285 .widenScalarToNextPow2(0, 32) 286 .scalarize(0); 287 288 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 289 .legalFor({S32}) 290 .clampScalar(0, S32, S32) 291 .scalarize(0); 292 293 // Report legal for any types we can handle anywhere. For the cases only legal 294 // on the SALU, RegBankSelect will be able to re-legalize. 
295 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 296 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 297 .clampScalar(0, S32, S64) 298 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 299 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 300 .widenScalarToNextPow2(0) 301 .scalarize(0); 302 303 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 304 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 305 .legalFor({{S32, S1}, {S32, S32}}) 306 .clampScalar(0, S32, S32) 307 .scalarize(0); // TODO: Implement. 308 309 getActionDefinitionsBuilder(G_BITCAST) 310 // Don't worry about the size constraint. 311 .legalIf(all(isRegisterType(0), isRegisterType(1))) 312 // FIXME: Testing hack 313 .legalForCartesianProduct({S16, LLT::vector(2, 8), }) 314 .lower(); 315 316 317 getActionDefinitionsBuilder(G_CONSTANT) 318 .legalFor({S1, S32, S64, S16, GlobalPtr, 319 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 320 .clampScalar(0, S32, S64) 321 .widenScalarToNextPow2(0) 322 .legalIf(isPointer(0)); 323 324 getActionDefinitionsBuilder(G_FCONSTANT) 325 .legalFor({S32, S64, S16}) 326 .clampScalar(0, S16, S64); 327 328 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 329 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 330 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 331 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 332 .clampScalarOrElt(0, S32, S1024) 333 .legalIf(isMultiple32(0)) 334 .widenScalarToNextPow2(0, 32) 335 .clampMaxNumElements(0, S32, 16); 336 337 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 338 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 339 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); 340 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 341 342 auto &FPOpActions = getActionDefinitionsBuilder( 343 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 344 .legalFor({S32, S64}); 345 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 346 .customFor({S32, S64}); 347 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 348 .customFor({S32, S64}); 349 350 if (ST.has16BitInsts()) { 351 if (ST.hasVOP3PInsts()) 352 FPOpActions.legalFor({S16, V2S16}); 353 else 354 FPOpActions.legalFor({S16}); 355 356 TrigActions.customFor({S16}); 357 FDIVActions.customFor({S16}); 358 } 359 360 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 361 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 362 363 if (ST.hasVOP3PInsts()) { 364 MinNumMaxNum.customFor(FPTypesPK16) 365 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 366 .clampMaxNumElements(0, S16, 2) 367 .clampScalar(0, S16, S64) 368 .scalarize(0); 369 } else if (ST.has16BitInsts()) { 370 MinNumMaxNum.customFor(FPTypes16) 371 .clampScalar(0, S16, S64) 372 .scalarize(0); 373 } else { 374 MinNumMaxNum.customFor(FPTypesBase) 375 .clampScalar(0, S32, S64) 376 .scalarize(0); 377 } 378 379 if (ST.hasVOP3PInsts()) 380 FPOpActions.clampMaxNumElements(0, S16, 2); 381 382 FPOpActions 383 .scalarize(0) 384 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 385 386 TrigActions 387 .scalarize(0) 388 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 389 390 FDIVActions 391 .scalarize(0) 392 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 393 394 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 395 .legalFor(FPTypesPK16) 396 .clampMaxNumElements(0, S16, 2) 397 .scalarize(0) 398 .clampScalar(0, S16, S64); 399 400 if (ST.has16BitInsts()) { 401 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 402 .legalFor({S32, S64, S16}) 403 .scalarize(0) 404 .clampScalar(0, S16, S64); 405 } else { 406 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 407 .legalFor({S32, S64}) 408 .scalarize(0) 409 .clampScalar(0, S32, S64); 410 } 411 412 getActionDefinitionsBuilder(G_FPTRUNC) 413 .legalFor({{S32, S64}, {S16, S32}}) 414 .scalarize(0); 415 416 getActionDefinitionsBuilder(G_FPEXT) 417 .legalFor({{S64, S32}, {S32, S16}}) 418 .lowerFor({{S64, S16}}) // FIXME: Implement 419 .scalarize(0); 420 421 getActionDefinitionsBuilder(G_FSUB) 422 // Use actual fsub instruction 423 .legalFor({S32}) 424 // Must use fadd + fneg 425 .lowerFor({S64, S16, V2S16}) 426 .scalarize(0) 427 .clampScalar(0, S32, S64); 428 429 // Whether this is legal depends on the floating point mode for the function. 430 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 431 if (ST.hasMadF16()) 432 FMad.customFor({S32, S16}); 433 else 434 FMad.customFor({S32}); 435 FMad.scalarize(0) 436 .lower(); 437 438 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 439 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 440 {S32, S1}, {S64, S1}, {S16, S1}, 441 {S96, S32}, 442 // FIXME: Hack 443 {S64, LLT::scalar(33)}, 444 {S32, S8}, {S32, LLT::scalar(24)}}) 445 .scalarize(0) 446 .clampScalar(0, S32, S64); 447 448 // TODO: Split s1->s64 during regbankselect for VALU. 449 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 450 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 451 .lowerFor({{S32, S64}}) 452 .lowerIf(typeIs(1, S1)) 453 .customFor({{S64, S64}}); 454 if (ST.has16BitInsts()) 455 IToFP.legalFor({{S16, S16}}); 456 IToFP.clampScalar(1, S32, S64) 457 .scalarize(0); 458 459 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 460 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 461 .customFor({{S64, S64}}); 462 if (ST.has16BitInsts()) 463 FPToI.legalFor({{S16, S16}}); 464 else 465 FPToI.minScalar(1, S32); 466 467 FPToI.minScalar(0, S32) 468 .scalarize(0) 469 .lower(); 470 471 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 472 .scalarize(0) 473 .lower(); 474 475 if (ST.has16BitInsts()) { 476 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 477 .legalFor({S16, S32, S64}) 478 .clampScalar(0, S16, S64) 479 .scalarize(0); 480 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 481 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 482 .legalFor({S32, S64}) 483 .clampScalar(0, S32, S64) 484 .scalarize(0); 485 } else { 486 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 487 .legalFor({S32}) 488 .customFor({S64}) 489 .clampScalar(0, S32, S64) 490 .scalarize(0); 491 } 492 493 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 494 .scalarize(0) 495 .alwaysLegal(); 496 497 auto &CmpBuilder = 498 getActionDefinitionsBuilder(G_ICMP) 499 // The compare output type differs based on the register bank of the output, 500 // so make both s1 and s32 legal. 501 // 502 // Scalar compares producing output in scc will be promoted to s32, as that 503 // is the allocatable register type that will be needed for the copy from 504 // scc. This will be promoted during RegBankSelect, and we assume something 505 // before that won't try to use s32 result types. 
506 // 507 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 508 // bank. 509 .legalForCartesianProduct( 510 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 511 .legalForCartesianProduct( 512 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 513 if (ST.has16BitInsts()) { 514 CmpBuilder.legalFor({{S1, S16}}); 515 } 516 517 CmpBuilder 518 .widenScalarToNextPow2(1) 519 .clampScalar(1, S32, S64) 520 .scalarize(0) 521 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 522 523 getActionDefinitionsBuilder(G_FCMP) 524 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 525 .widenScalarToNextPow2(1) 526 .clampScalar(1, S32, S64) 527 .scalarize(0); 528 529 // FIXME: fexp, flog2, flog10 needs to be custom lowered. 530 getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2, 531 G_FLOG2}) 532 .legalFor({S32}) 533 .scalarize(0); 534 535 getActionDefinitionsBuilder({G_FLOG, G_FLOG10}) 536 .customFor({S32}) 537 .clampScalar(0, S32, S32) 538 .scalarize(0); 539 540 // The 64-bit versions produce 32-bit results, but only on the SALU. 541 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, 542 G_CTTZ, G_CTTZ_ZERO_UNDEF, 543 G_CTPOP}) 544 .legalFor({{S32, S32}, {S32, S64}}) 545 .clampScalar(0, S32, S32) 546 .clampScalar(1, S32, S64) 547 .scalarize(0) 548 .widenScalarToNextPow2(0, 32) 549 .widenScalarToNextPow2(1, 32); 550 551 // TODO: Expand for > s32 552 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) 553 .legalFor({S32}) 554 .clampScalar(0, S32, S32) 555 .scalarize(0); 556 557 if (ST.has16BitInsts()) { 558 if (ST.hasVOP3PInsts()) { 559 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 560 .legalFor({S32, S16, V2S16}) 561 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 562 .clampMaxNumElements(0, S16, 2) 563 .clampScalar(0, S16, S32) 564 .widenScalarToNextPow2(0) 565 .scalarize(0); 566 } else { 567 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 568 .legalFor({S32, S16}) 569 .widenScalarToNextPow2(0) 570 .clampScalar(0, S16, S32) 571 .scalarize(0); 572 } 573 } else { 574 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 575 .legalFor({S32}) 576 .clampScalar(0, S32, S32) 577 .widenScalarToNextPow2(0) 578 .scalarize(0); 579 } 580 581 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 582 return [=](const LegalityQuery &Query) { 583 return Query.Types[TypeIdx0].getSizeInBits() < 584 Query.Types[TypeIdx1].getSizeInBits(); 585 }; 586 }; 587 588 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 589 return [=](const LegalityQuery &Query) { 590 return Query.Types[TypeIdx0].getSizeInBits() > 591 Query.Types[TypeIdx1].getSizeInBits(); 592 }; 593 }; 594 595 getActionDefinitionsBuilder(G_INTTOPTR) 596 // List the common cases 597 .legalForCartesianProduct(AddrSpaces64, {S64}) 598 .legalForCartesianProduct(AddrSpaces32, {S32}) 599 .scalarize(0) 600 // Accept any address space as long as the size matches 601 .legalIf(sameSize(0, 1)) 602 .widenScalarIf(smallerThan(1, 0), 603 [](const LegalityQuery &Query) { 604 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 605 }) 606 .narrowScalarIf(greaterThan(1, 0), 607 [](const LegalityQuery &Query) { 608 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 609 }); 610 611 getActionDefinitionsBuilder(G_PTRTOINT) 612 // List the common cases 613 .legalForCartesianProduct(AddrSpaces64, {S64}) 614 .legalForCartesianProduct(AddrSpaces32, {S32}) 615 .scalarize(0) 616 // 
Accept any address space as long as the size matches 617 .legalIf(sameSize(0, 1)) 618 .widenScalarIf(smallerThan(0, 1), 619 [](const LegalityQuery &Query) { 620 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 621 }) 622 .narrowScalarIf( 623 greaterThan(0, 1), 624 [](const LegalityQuery &Query) { 625 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 626 }); 627 628 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 629 .scalarize(0) 630 .custom(); 631 632 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 633 // handle some operations by just promoting the register during 634 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 635 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { 636 switch (AS) { 637 // FIXME: Private element size. 638 case AMDGPUAS::PRIVATE_ADDRESS: 639 return 32; 640 // FIXME: Check subtarget 641 case AMDGPUAS::LOCAL_ADDRESS: 642 return ST.useDS128() ? 128 : 64; 643 644 // Treat constant and global as identical. SMRD loads are sometimes usable 645 // for global loads (ideally constant address space should be eliminated) 646 // depending on the context. Legality cannot be context dependent, but 647 // RegBankSelect can split the load as necessary depending on the pointer 648 // register bank/uniformity and if the memory is invariant or not written in 649 // a kernel. 650 case AMDGPUAS::CONSTANT_ADDRESS: 651 case AMDGPUAS::GLOBAL_ADDRESS: 652 return 512; 653 default: 654 return 128; 655 } 656 }; 657 658 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { 659 const LLT DstTy = Query.Types[0]; 660 661 // Split vector extloads. 662 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 663 unsigned Align = Query.MMODescrs[0].AlignInBits; 664 665 if (MemSize < DstTy.getSizeInBits()) 666 MemSize = std::max(MemSize, Align); 667 668 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 669 return true; 670 671 const LLT PtrTy = Query.Types[1]; 672 unsigned AS = PtrTy.getAddressSpace(); 673 if (MemSize > maxSizeForAddrSpace(AS)) 674 return true; 675 676 // Catch weird sized loads that don't evenly divide into the access sizes 677 // TODO: May be able to widen depending on alignment etc. 678 unsigned NumRegs = MemSize / 32; 679 if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) 680 return true; 681 682 if (Align < MemSize) { 683 const SITargetLowering *TLI = ST.getTargetLowering(); 684 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 685 } 686 687 return false; 688 }; 689 690 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 691 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 692 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 693 694 // TODO: Refine based on subtargets which support unaligned access or 128-bit 695 // LDS 696 // TODO: Unsupported flat for SI. 697 698 for (unsigned Op : {G_LOAD, G_STORE}) { 699 const bool IsStore = Op == G_STORE; 700 701 auto &Actions = getActionDefinitionsBuilder(Op); 702 // Whitelist the common cases. 
703 // TODO: Pointer loads 704 // TODO: Wide constant loads 705 // TODO: Only CI+ has 3x loads 706 // TODO: Loads to s16 on gfx9 707 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 708 {V2S32, GlobalPtr, 64, GlobalAlign32}, 709 {V3S32, GlobalPtr, 96, GlobalAlign32}, 710 {S96, GlobalPtr, 96, GlobalAlign32}, 711 {V4S32, GlobalPtr, 128, GlobalAlign32}, 712 {S128, GlobalPtr, 128, GlobalAlign32}, 713 {S64, GlobalPtr, 64, GlobalAlign32}, 714 {V2S64, GlobalPtr, 128, GlobalAlign32}, 715 {V2S16, GlobalPtr, 32, GlobalAlign32}, 716 {S32, GlobalPtr, 8, GlobalAlign8}, 717 {S32, GlobalPtr, 16, GlobalAlign16}, 718 719 {S32, LocalPtr, 32, 32}, 720 {S64, LocalPtr, 64, 32}, 721 {V2S32, LocalPtr, 64, 32}, 722 {S32, LocalPtr, 8, 8}, 723 {S32, LocalPtr, 16, 16}, 724 {V2S16, LocalPtr, 32, 32}, 725 726 {S32, PrivatePtr, 32, 32}, 727 {S32, PrivatePtr, 8, 8}, 728 {S32, PrivatePtr, 16, 16}, 729 {V2S16, PrivatePtr, 32, 32}, 730 731 {S32, FlatPtr, 32, GlobalAlign32}, 732 {S32, FlatPtr, 16, GlobalAlign16}, 733 {S32, FlatPtr, 8, GlobalAlign8}, 734 {V2S16, FlatPtr, 32, GlobalAlign32}, 735 736 {S32, ConstantPtr, 32, GlobalAlign32}, 737 {V2S32, ConstantPtr, 64, GlobalAlign32}, 738 {V3S32, ConstantPtr, 96, GlobalAlign32}, 739 {V4S32, ConstantPtr, 128, GlobalAlign32}, 740 {S64, ConstantPtr, 64, GlobalAlign32}, 741 {S128, ConstantPtr, 128, GlobalAlign32}, 742 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 743 Actions 744 .customIf(typeIs(1, Constant32Ptr)) 745 .narrowScalarIf( 746 [=](const LegalityQuery &Query) -> bool { 747 return !Query.Types[0].isVector() && needToSplitLoad(Query); 748 }, 749 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 750 const LLT DstTy = Query.Types[0]; 751 const LLT PtrTy = Query.Types[1]; 752 753 const unsigned DstSize = DstTy.getSizeInBits(); 754 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 755 756 // Split extloads. 757 if (DstSize > MemSize) 758 return std::make_pair(0, LLT::scalar(MemSize)); 759 760 if (DstSize > 32 && (DstSize % 32 != 0)) { 761 // FIXME: Need a way to specify non-extload of larger size if 762 // suitably aligned. 763 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 764 } 765 766 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 767 if (MemSize > MaxSize) 768 return std::make_pair(0, LLT::scalar(MaxSize)); 769 770 unsigned Align = Query.MMODescrs[0].AlignInBits; 771 return std::make_pair(0, LLT::scalar(Align)); 772 }) 773 .fewerElementsIf( 774 [=](const LegalityQuery &Query) -> bool { 775 return Query.Types[0].isVector() && needToSplitLoad(Query); 776 }, 777 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 778 const LLT DstTy = Query.Types[0]; 779 const LLT PtrTy = Query.Types[1]; 780 781 LLT EltTy = DstTy.getElementType(); 782 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 783 784 // Split if it's too large for the address space. 785 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 786 unsigned NumElts = DstTy.getNumElements(); 787 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 788 789 // FIXME: Refine when odd breakdowns handled 790 // The scalars will need to be re-legalized. 791 if (NumPieces == 1 || NumPieces >= NumElts || 792 NumElts % NumPieces != 0) 793 return std::make_pair(0, EltTy); 794 795 return std::make_pair(0, 796 LLT::vector(NumElts / NumPieces, EltTy)); 797 } 798 799 // Need to split because of alignment. 
800 unsigned Align = Query.MMODescrs[0].AlignInBits; 801 unsigned EltSize = EltTy.getSizeInBits(); 802 if (EltSize > Align && 803 (EltSize / Align < DstTy.getNumElements())) { 804 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 805 } 806 807 // May need relegalization for the scalars. 808 return std::make_pair(0, EltTy); 809 }) 810 .minScalar(0, S32); 811 812 if (IsStore) 813 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 814 815 // TODO: Need a bitcast lower option? 816 Actions 817 .legalIf([=](const LegalityQuery &Query) { 818 const LLT Ty0 = Query.Types[0]; 819 unsigned Size = Ty0.getSizeInBits(); 820 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 821 unsigned Align = Query.MMODescrs[0].AlignInBits; 822 823 // FIXME: Widening store from alignment not valid. 824 if (MemSize < Size) 825 MemSize = std::max(MemSize, Align); 826 827 // No extending vector loads. 828 if (Size > MemSize && Ty0.isVector()) 829 return false; 830 831 switch (MemSize) { 832 case 8: 833 case 16: 834 return Size == 32; 835 case 32: 836 case 64: 837 case 128: 838 return true; 839 case 96: 840 return ST.hasDwordx3LoadStores(); 841 case 256: 842 case 512: 843 return true; 844 default: 845 return false; 846 } 847 }) 848 .widenScalarToNextPow2(0) 849 // TODO: v3s32->v4s32 with alignment 850 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 851 } 852 853 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 854 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 855 {S32, GlobalPtr, 16, 2 * 8}, 856 {S32, LocalPtr, 8, 8}, 857 {S32, LocalPtr, 16, 16}, 858 {S32, PrivatePtr, 8, 8}, 859 {S32, PrivatePtr, 16, 16}, 860 {S32, ConstantPtr, 8, 8}, 861 {S32, ConstantPtr, 16, 2 * 8}}); 862 if (ST.hasFlatAddressSpace()) { 863 ExtLoads.legalForTypesWithMemDesc( 864 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 865 } 866 867 ExtLoads.clampScalar(0, S32, S32) 868 .widenScalarToNextPow2(0) 869 .unsupportedIfMemSizeNotPow2() 870 .lower(); 871 872 auto &Atomics = getActionDefinitionsBuilder( 873 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 874 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 875 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 876 G_ATOMICRMW_UMIN}) 877 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 878 {S64, GlobalPtr}, {S64, LocalPtr}}); 879 if (ST.hasFlatAddressSpace()) { 880 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 881 } 882 883 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 884 .legalFor({{S32, LocalPtr}}); 885 886 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 887 // demarshalling 888 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 889 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 890 {S32, FlatPtr}, {S64, FlatPtr}}) 891 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 892 {S32, RegionPtr}, {S64, RegionPtr}}); 893 // TODO: Pointer types, any 32-bit or 64-bit vector 894 895 // Condition should be s32 for scalar, s1 for vector. 
896 getActionDefinitionsBuilder(G_SELECT) 897 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 898 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 899 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 900 .clampScalar(0, S16, S64) 901 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 902 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 903 .scalarize(1) 904 .clampMaxNumElements(0, S32, 2) 905 .clampMaxNumElements(0, LocalPtr, 2) 906 .clampMaxNumElements(0, PrivatePtr, 2) 907 .scalarize(0) 908 .widenScalarToNextPow2(0) 909 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 910 911 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 912 // be more flexible with the shift amount type. 913 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 914 .legalFor({{S32, S32}, {S64, S32}}); 915 if (ST.has16BitInsts()) { 916 if (ST.hasVOP3PInsts()) { 917 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 918 .clampMaxNumElements(0, S16, 2); 919 } else 920 Shifts.legalFor({{S16, S32}, {S16, S16}}); 921 922 // TODO: Support 16-bit shift amounts 923 Shifts.clampScalar(1, S32, S32); 924 Shifts.clampScalar(0, S16, S64); 925 Shifts.widenScalarToNextPow2(0, 16); 926 } else { 927 // Make sure we legalize the shift amount type first, as the general 928 // expansion for the shifted type will produce much worse code if it hasn't 929 // been truncated already. 930 Shifts.clampScalar(1, S32, S32); 931 Shifts.clampScalar(0, S32, S64); 932 Shifts.widenScalarToNextPow2(0, 32); 933 } 934 Shifts.scalarize(0); 935 936 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 937 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 938 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 939 unsigned IdxTypeIdx = 2; 940 941 getActionDefinitionsBuilder(Op) 942 .customIf([=](const LegalityQuery &Query) { 943 const LLT EltTy = Query.Types[EltTypeIdx]; 944 const LLT VecTy = Query.Types[VecTypeIdx]; 945 const LLT IdxTy = Query.Types[IdxTypeIdx]; 946 return (EltTy.getSizeInBits() == 16 || 947 EltTy.getSizeInBits() % 32 == 0) && 948 VecTy.getSizeInBits() % 32 == 0 && 949 VecTy.getSizeInBits() <= 1024 && 950 IdxTy.getSizeInBits() == 32; 951 }) 952 .clampScalar(EltTypeIdx, S32, S64) 953 .clampScalar(VecTypeIdx, S32, S64) 954 .clampScalar(IdxTypeIdx, S32, S32); 955 } 956 957 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 958 .unsupportedIf([=](const LegalityQuery &Query) { 959 const LLT &EltTy = Query.Types[1].getElementType(); 960 return Query.Types[0] != EltTy; 961 }); 962 963 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 964 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 965 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 966 967 // FIXME: Doesn't handle extract of illegal sizes. 968 getActionDefinitionsBuilder(Op) 969 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 970 // FIXME: Multiples of 16 should not be legal. 
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)

      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .lowerFor({{S16, V2S16}})
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // TODO: Make legal for s32, s64. s64 case needs break down in regbankselect.
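  // For now this is always expanded: the operand is clamped to a type with
  // legal shifts, and the generic lowering then turns the sext_inreg into a
  // shl/ashr pair.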
1119 getActionDefinitionsBuilder(G_SEXT_INREG) 1120 .clampScalar(0, MinLegalScalarShiftTy, S64) 1121 .lower(); 1122 1123 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1124 .legalFor({S64}); 1125 1126 getActionDefinitionsBuilder({ 1127 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1128 G_FCOPYSIGN, 1129 1130 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1131 G_READ_REGISTER, 1132 G_WRITE_REGISTER, 1133 1134 G_SADDO, G_SSUBO, 1135 1136 // TODO: Implement 1137 G_FMINIMUM, G_FMAXIMUM 1138 }).lower(); 1139 1140 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1141 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1142 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1143 .unsupported(); 1144 1145 computeTables(); 1146 verify(*ST.getInstrInfo()); 1147 } 1148 1149 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1150 MachineRegisterInfo &MRI, 1151 MachineIRBuilder &B, 1152 GISelChangeObserver &Observer) const { 1153 switch (MI.getOpcode()) { 1154 case TargetOpcode::G_ADDRSPACE_CAST: 1155 return legalizeAddrSpaceCast(MI, MRI, B); 1156 case TargetOpcode::G_FRINT: 1157 return legalizeFrint(MI, MRI, B); 1158 case TargetOpcode::G_FCEIL: 1159 return legalizeFceil(MI, MRI, B); 1160 case TargetOpcode::G_INTRINSIC_TRUNC: 1161 return legalizeIntrinsicTrunc(MI, MRI, B); 1162 case TargetOpcode::G_SITOFP: 1163 return legalizeITOFP(MI, MRI, B, true); 1164 case TargetOpcode::G_UITOFP: 1165 return legalizeITOFP(MI, MRI, B, false); 1166 case TargetOpcode::G_FPTOSI: 1167 return legalizeFPTOI(MI, MRI, B, true); 1168 case TargetOpcode::G_FPTOUI: 1169 return legalizeFPTOI(MI, MRI, B, false); 1170 case TargetOpcode::G_FMINNUM: 1171 case TargetOpcode::G_FMAXNUM: 1172 case TargetOpcode::G_FMINNUM_IEEE: 1173 case TargetOpcode::G_FMAXNUM_IEEE: 1174 return legalizeMinNumMaxNum(MI, MRI, B); 1175 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1176 return legalizeExtractVectorElt(MI, MRI, B); 1177 case TargetOpcode::G_INSERT_VECTOR_ELT: 1178 return legalizeInsertVectorElt(MI, MRI, B); 1179 case TargetOpcode::G_SHUFFLE_VECTOR: 1180 return legalizeShuffleVector(MI, MRI, B); 1181 case TargetOpcode::G_FSIN: 1182 case TargetOpcode::G_FCOS: 1183 return legalizeSinCos(MI, MRI, B); 1184 case TargetOpcode::G_GLOBAL_VALUE: 1185 return legalizeGlobalValue(MI, MRI, B); 1186 case TargetOpcode::G_LOAD: 1187 return legalizeLoad(MI, MRI, B, Observer); 1188 case TargetOpcode::G_FMAD: 1189 return legalizeFMad(MI, MRI, B); 1190 case TargetOpcode::G_FDIV: 1191 return legalizeFDIV(MI, MRI, B); 1192 case TargetOpcode::G_ATOMIC_CMPXCHG: 1193 return legalizeAtomicCmpXChg(MI, MRI, B); 1194 case TargetOpcode::G_FLOG: 1195 return legalizeFlog(MI, B, 1.0f / numbers::log2ef); 1196 case TargetOpcode::G_FLOG10: 1197 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1198 default: 1199 return false; 1200 } 1201 1202 llvm_unreachable("expected switch to return"); 1203 } 1204 1205 Register AMDGPULegalizerInfo::getSegmentAperture( 1206 unsigned AS, 1207 MachineRegisterInfo &MRI, 1208 MachineIRBuilder &B) const { 1209 MachineFunction &MF = B.getMF(); 1210 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1211 const LLT S32 = LLT::scalar(32); 1212 1213 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1214 1215 if (ST.hasApertureRegs()) { 1216 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1217 // getreg. 1218 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 
1219 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1220 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1221 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1222 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1223 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1224 unsigned Encoding = 1225 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1226 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1227 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1228 1229 Register ApertureReg = MRI.createGenericVirtualRegister(S32); 1230 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1231 1232 B.buildInstr(AMDGPU::S_GETREG_B32) 1233 .addDef(GetReg) 1234 .addImm(Encoding); 1235 MRI.setType(GetReg, S32); 1236 1237 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1238 B.buildInstr(TargetOpcode::G_SHL) 1239 .addDef(ApertureReg) 1240 .addUse(GetReg) 1241 .addUse(ShiftAmt.getReg(0)); 1242 1243 return ApertureReg; 1244 } 1245 1246 Register QueuePtr = MRI.createGenericVirtualRegister( 1247 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1248 1249 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1250 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1251 return Register(); 1252 1253 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1254 // private_segment_aperture_base_hi. 1255 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1256 1257 // TODO: can we be smarter about machine pointer info? 1258 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1259 MachineMemOperand *MMO = MF.getMachineMemOperand( 1260 PtrInfo, 1261 MachineMemOperand::MOLoad | 1262 MachineMemOperand::MODereferenceable | 1263 MachineMemOperand::MOInvariant, 1264 4, 1265 MinAlign(64, StructOffset)); 1266 1267 Register LoadResult = MRI.createGenericVirtualRegister(S32); 1268 Register LoadAddr; 1269 1270 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1271 B.buildLoad(LoadResult, LoadAddr, *MMO); 1272 return LoadResult; 1273 } 1274 1275 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1276 MachineInstr &MI, MachineRegisterInfo &MRI, 1277 MachineIRBuilder &B) const { 1278 MachineFunction &MF = B.getMF(); 1279 1280 B.setInstr(MI); 1281 1282 const LLT S32 = LLT::scalar(32); 1283 Register Dst = MI.getOperand(0).getReg(); 1284 Register Src = MI.getOperand(1).getReg(); 1285 1286 LLT DstTy = MRI.getType(Dst); 1287 LLT SrcTy = MRI.getType(Src); 1288 unsigned DestAS = DstTy.getAddressSpace(); 1289 unsigned SrcAS = SrcTy.getAddressSpace(); 1290 1291 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1292 // vector element. 1293 assert(!DstTy.isVector()); 1294 1295 const AMDGPUTargetMachine &TM 1296 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1297 1298 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1299 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1300 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1301 return true; 1302 } 1303 1304 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1305 // Truncate. 1306 B.buildExtract(Dst, Src, 0); 1307 MI.eraseFromParent(); 1308 return true; 1309 } 1310 1311 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1312 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1313 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1314 1315 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1316 // another. Merge operands are required to be the same type, but creating an 1317 // extra ptrtoint would be kind of pointless. 
1318 auto HighAddr = B.buildConstant( 1319 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1320 B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); 1321 MI.eraseFromParent(); 1322 return true; 1323 } 1324 1325 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1326 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1327 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1328 unsigned NullVal = TM.getNullPointerValue(DestAS); 1329 1330 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1331 auto FlatNull = B.buildConstant(SrcTy, 0); 1332 1333 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); 1334 1335 // Extract low 32-bits of the pointer. 1336 B.buildExtract(PtrLo32, Src, 0); 1337 1338 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1339 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); 1340 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1341 1342 MI.eraseFromParent(); 1343 return true; 1344 } 1345 1346 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1347 return false; 1348 1349 if (!ST.hasFlatAddressSpace()) 1350 return false; 1351 1352 auto SegmentNull = 1353 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1354 auto FlatNull = 1355 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1356 1357 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1358 if (!ApertureReg.isValid()) 1359 return false; 1360 1361 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1362 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); 1363 1364 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); 1365 1366 // Coerce the type of the low half of the result so we can use merge_values. 1367 Register SrcAsInt = MRI.createGenericVirtualRegister(S32); 1368 B.buildInstr(TargetOpcode::G_PTRTOINT) 1369 .addDef(SrcAsInt) 1370 .addUse(Src); 1371 1372 // TODO: Should we allow mismatched types but matching sizes in merges to 1373 // avoid the ptrtoint? 1374 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); 1375 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); 1376 1377 MI.eraseFromParent(); 1378 return true; 1379 } 1380 1381 bool AMDGPULegalizerInfo::legalizeFrint( 1382 MachineInstr &MI, MachineRegisterInfo &MRI, 1383 MachineIRBuilder &B) const { 1384 B.setInstr(MI); 1385 1386 Register Src = MI.getOperand(1).getReg(); 1387 LLT Ty = MRI.getType(Src); 1388 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1389 1390 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1391 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1392 1393 auto C1 = B.buildFConstant(Ty, C1Val); 1394 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1395 1396 // TODO: Should this propagate fast-math-flags? 
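  // For |Src| < 2^52, adding the copysigned 2^52 pushes the fraction bits out
  // of the f64 mantissa, so subtracting it again yields Src rounded to the
  // nearest integer. Larger magnitudes are already integral and are returned
  // unchanged by the select below.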
1397 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1398 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1399 1400 auto C2 = B.buildFConstant(Ty, C2Val); 1401 auto Fabs = B.buildFAbs(Ty, Src); 1402 1403 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1404 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1405 return true; 1406 } 1407 1408 bool AMDGPULegalizerInfo::legalizeFceil( 1409 MachineInstr &MI, MachineRegisterInfo &MRI, 1410 MachineIRBuilder &B) const { 1411 B.setInstr(MI); 1412 1413 const LLT S1 = LLT::scalar(1); 1414 const LLT S64 = LLT::scalar(64); 1415 1416 Register Src = MI.getOperand(1).getReg(); 1417 assert(MRI.getType(Src) == S64); 1418 1419 // result = trunc(src) 1420 // if (src > 0.0 && src != result) 1421 // result += 1.0 1422 1423 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); 1424 1425 const auto Zero = B.buildFConstant(S64, 0.0); 1426 const auto One = B.buildFConstant(S64, 1.0); 1427 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1428 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1429 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1430 auto Add = B.buildSelect(S64, And, One, Zero); 1431 1432 // TODO: Should this propagate fast-math-flags? 1433 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1434 return true; 1435 } 1436 1437 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1438 MachineIRBuilder &B) { 1439 const unsigned FractBits = 52; 1440 const unsigned ExpBits = 11; 1441 LLT S32 = LLT::scalar(32); 1442 1443 auto Const0 = B.buildConstant(S32, FractBits - 32); 1444 auto Const1 = B.buildConstant(S32, ExpBits); 1445 1446 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1447 .addUse(Const0.getReg(0)) 1448 .addUse(Const1.getReg(0)); 1449 1450 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1451 } 1452 1453 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1454 MachineInstr &MI, MachineRegisterInfo &MRI, 1455 MachineIRBuilder &B) const { 1456 B.setInstr(MI); 1457 1458 const LLT S1 = LLT::scalar(1); 1459 const LLT S32 = LLT::scalar(32); 1460 const LLT S64 = LLT::scalar(64); 1461 1462 Register Src = MI.getOperand(1).getReg(); 1463 assert(MRI.getType(Src) == S64); 1464 1465 // TODO: Should this use extract since the low half is unused? 1466 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1467 Register Hi = Unmerge.getReg(1); 1468 1469 // Extract the upper half, since this is where we will find the sign and 1470 // exponent. 1471 auto Exp = extractF64Exponent(Hi, B); 1472 1473 const unsigned FractBits = 52; 1474 1475 // Extract the sign bit. 1476 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1477 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1478 1479 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1480 1481 const auto Zero32 = B.buildConstant(S32, 0); 1482 1483 // Extend back to 64-bits. 
1484 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); 1485 1486 auto Shr = B.buildAShr(S64, FractMask, Exp); 1487 auto Not = B.buildNot(S64, Shr); 1488 auto Tmp0 = B.buildAnd(S64, Src, Not); 1489 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1490 1491 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1492 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1493 1494 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1495 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1496 return true; 1497 } 1498 1499 bool AMDGPULegalizerInfo::legalizeITOFP( 1500 MachineInstr &MI, MachineRegisterInfo &MRI, 1501 MachineIRBuilder &B, bool Signed) const { 1502 B.setInstr(MI); 1503 1504 Register Dst = MI.getOperand(0).getReg(); 1505 Register Src = MI.getOperand(1).getReg(); 1506 1507 const LLT S64 = LLT::scalar(64); 1508 const LLT S32 = LLT::scalar(32); 1509 1510 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1511 1512 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1513 1514 auto CvtHi = Signed ? 1515 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1516 B.buildUITOFP(S64, Unmerge.getReg(1)); 1517 1518 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1519 1520 auto ThirtyTwo = B.buildConstant(S32, 32); 1521 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1522 .addUse(CvtHi.getReg(0)) 1523 .addUse(ThirtyTwo.getReg(0)); 1524 1525 // TODO: Should this propagate fast-math-flags? 1526 B.buildFAdd(Dst, LdExp, CvtLo); 1527 MI.eraseFromParent(); 1528 return true; 1529 } 1530 1531 // TODO: Copied from DAG implementation. Verify logic and document how this 1532 // actually works. 1533 bool AMDGPULegalizerInfo::legalizeFPTOI( 1534 MachineInstr &MI, MachineRegisterInfo &MRI, 1535 MachineIRBuilder &B, bool Signed) const { 1536 B.setInstr(MI); 1537 1538 Register Dst = MI.getOperand(0).getReg(); 1539 Register Src = MI.getOperand(1).getReg(); 1540 1541 const LLT S64 = LLT::scalar(64); 1542 const LLT S32 = LLT::scalar(32); 1543 1544 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1545 1546 unsigned Flags = MI.getFlags(); 1547 1548 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1549 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1550 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1551 1552 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1553 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1554 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1555 1556 auto Hi = Signed ? 
1557 B.buildFPTOSI(S32, FloorMul) : 1558 B.buildFPTOUI(S32, FloorMul); 1559 auto Lo = B.buildFPTOUI(S32, Fma); 1560 1561 B.buildMerge(Dst, { Lo.getReg(0), Hi.getReg(0) }); 1562 MI.eraseFromParent(); 1563 1564 return true; 1565 } 1566 1567 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1568 MachineInstr &MI, MachineRegisterInfo &MRI, 1569 MachineIRBuilder &B) const { 1570 MachineFunction &MF = B.getMF(); 1571 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1572 1573 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1574 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1575 1576 // With ieee_mode disabled, the instructions have the correct behavior 1577 // already for G_FMINNUM/G_FMAXNUM 1578 if (!MFI->getMode().IEEE) 1579 return !IsIEEEOp; 1580 1581 if (IsIEEEOp) 1582 return true; 1583 1584 MachineIRBuilder HelperBuilder(MI); 1585 GISelObserverWrapper DummyObserver; 1586 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1587 HelperBuilder.setInstr(MI); 1588 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1589 } 1590 1591 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1592 MachineInstr &MI, MachineRegisterInfo &MRI, 1593 MachineIRBuilder &B) const { 1594 // TODO: Should move some of this into LegalizerHelper. 1595 1596 // TODO: Promote dynamic indexing of s16 to s32 1597 // TODO: Dynamic s64 indexing is only legal for SGPR. 1598 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); 1599 if (!IdxVal) // Dynamic case will be selected to register indexing. 1600 return true; 1601 1602 Register Dst = MI.getOperand(0).getReg(); 1603 Register Vec = MI.getOperand(1).getReg(); 1604 1605 LLT VecTy = MRI.getType(Vec); 1606 LLT EltTy = VecTy.getElementType(); 1607 assert(EltTy == MRI.getType(Dst)); 1608 1609 B.setInstr(MI); 1610 1611 if (IdxVal.getValue() < VecTy.getNumElements()) 1612 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); 1613 else 1614 B.buildUndef(Dst); 1615 1616 MI.eraseFromParent(); 1617 return true; 1618 } 1619 1620 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1621 MachineInstr &MI, MachineRegisterInfo &MRI, 1622 MachineIRBuilder &B) const { 1623 // TODO: Should move some of this into LegalizerHelper. 1624 1625 // TODO: Promote dynamic indexing of s16 to s32 1626 // TODO: Dynamic s64 indexing is only legal for SGPR. 1627 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); 1628 if (!IdxVal) // Dynamic case will be selected to register indexing. 1629 return true; 1630 1631 Register Dst = MI.getOperand(0).getReg(); 1632 Register Vec = MI.getOperand(1).getReg(); 1633 Register Ins = MI.getOperand(2).getReg(); 1634 1635 LLT VecTy = MRI.getType(Vec); 1636 LLT EltTy = VecTy.getElementType(); 1637 assert(EltTy == MRI.getType(Ins)); 1638 1639 B.setInstr(MI); 1640 1641 if (IdxVal.getValue() < VecTy.getNumElements()) 1642 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); 1643 else 1644 B.buildUndef(Dst); 1645 1646 MI.eraseFromParent(); 1647 return true; 1648 } 1649 1650 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) { 1651 assert(Mask.size() == 2); 1652 1653 // If one half is undef, the other is trivially in the same reg. 
1654 if (Mask[0] == -1 || Mask[1] == -1) 1655 return true; 1656 return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) || 1657 ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3)); 1658 } 1659 1660 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1661 MachineInstr &MI, MachineRegisterInfo &MRI, 1662 MachineIRBuilder &B) const { 1663 const LLT V2S16 = LLT::vector(2, 16); 1664 1665 Register Dst = MI.getOperand(0).getReg(); 1666 Register Src0 = MI.getOperand(1).getReg(); 1667 LLT DstTy = MRI.getType(Dst); 1668 LLT SrcTy = MRI.getType(Src0); 1669 1670 if (SrcTy == V2S16 && DstTy == V2S16 && 1671 isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1672 return true; 1673 1674 MachineIRBuilder HelperBuilder(MI); 1675 GISelObserverWrapper DummyObserver; 1676 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1677 HelperBuilder.setInstr(MI); 1678 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1679 } 1680 1681 bool AMDGPULegalizerInfo::legalizeSinCos( 1682 MachineInstr &MI, MachineRegisterInfo &MRI, 1683 MachineIRBuilder &B) const { 1684 B.setInstr(MI); 1685 1686 Register DstReg = MI.getOperand(0).getReg(); 1687 Register SrcReg = MI.getOperand(1).getReg(); 1688 LLT Ty = MRI.getType(DstReg); 1689 unsigned Flags = MI.getFlags(); 1690 1691 Register TrigVal; 1692 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1693 if (ST.hasTrigReducedRange()) { 1694 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1695 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1696 .addUse(MulVal.getReg(0)) 1697 .setMIFlags(Flags).getReg(0); 1698 } else 1699 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1700 1701 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1702 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1703 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1704 .addUse(TrigVal) 1705 .setMIFlags(Flags); 1706 MI.eraseFromParent(); 1707 return true; 1708 } 1709 1710 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1711 Register DstReg, LLT PtrTy, 1712 MachineIRBuilder &B, const GlobalValue *GV, 1713 unsigned Offset, unsigned GAFlags) const { 1714 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1715 // to the following code sequence: 1716 // 1717 // For constant address space: 1718 // s_getpc_b64 s[0:1] 1719 // s_add_u32 s0, s0, $symbol 1720 // s_addc_u32 s1, s1, 0 1721 // 1722 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1723 // a fixup or relocation is emitted to replace $symbol with a literal 1724 // constant, which is a pc-relative offset from the encoding of the $symbol 1725 // operand to the global variable. 1726 // 1727 // For global address space: 1728 // s_getpc_b64 s[0:1] 1729 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1730 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1731 // 1732 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1733 // fixups or relocations are emitted to replace $symbol@*@lo and 1734 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1735 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1736 // operand to the global variable. 
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
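    // The GOT entry itself is always a 64-bit constant-address pointer, so
    // load the full pointer and keep only the low 32 bits for 32-bit address
    // spaces.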
1833 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 1834 B.buildExtract(DstReg, Load, 0); 1835 } else 1836 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 1837 1838 MI.eraseFromParent(); 1839 return true; 1840 } 1841 1842 bool AMDGPULegalizerInfo::legalizeLoad( 1843 MachineInstr &MI, MachineRegisterInfo &MRI, 1844 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 1845 B.setInstr(MI); 1846 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1847 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 1848 Observer.changingInstr(MI); 1849 MI.getOperand(1).setReg(Cast.getReg(0)); 1850 Observer.changedInstr(MI); 1851 return true; 1852 } 1853 1854 bool AMDGPULegalizerInfo::legalizeFMad( 1855 MachineInstr &MI, MachineRegisterInfo &MRI, 1856 MachineIRBuilder &B) const { 1857 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 1858 assert(Ty.isScalar()); 1859 1860 MachineFunction &MF = B.getMF(); 1861 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1862 1863 // TODO: Always legal with future ftz flag. 1864 if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals) 1865 return true; 1866 if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals) 1867 return true; 1868 1869 1870 MachineIRBuilder HelperBuilder(MI); 1871 GISelObserverWrapper DummyObserver; 1872 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1873 HelperBuilder.setMBB(*MI.getParent()); 1874 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 1875 } 1876 1877 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 1878 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 1879 Register DstReg = MI.getOperand(0).getReg(); 1880 Register PtrReg = MI.getOperand(1).getReg(); 1881 Register CmpVal = MI.getOperand(2).getReg(); 1882 Register NewVal = MI.getOperand(3).getReg(); 1883 1884 assert(SITargetLowering::isFlatGlobalAddrSpace( 1885 MRI.getType(PtrReg).getAddressSpace()) && 1886 "this should not have been custom lowered"); 1887 1888 LLT ValTy = MRI.getType(CmpVal); 1889 LLT VecTy = LLT::vector(2, ValTy); 1890 1891 B.setInstr(MI); 1892 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 1893 1894 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 1895 .addDef(DstReg) 1896 .addUse(PtrReg) 1897 .addUse(PackedVal) 1898 .setMemRefs(MI.memoperands()); 1899 1900 MI.eraseFromParent(); 1901 return true; 1902 } 1903 1904 bool AMDGPULegalizerInfo::legalizeFlog( 1905 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 1906 Register Dst = MI.getOperand(0).getReg(); 1907 Register Src = MI.getOperand(1).getReg(); 1908 LLT Ty = B.getMRI()->getType(Dst); 1909 unsigned Flags = MI.getFlags(); 1910 B.setInstr(MI); 1911 1912 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 1913 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 1914 1915 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 1916 MI.eraseFromParent(); 1917 return true; 1918 } 1919 1920 // Return the use branch instruction, otherwise null if the usage is invalid. 
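// The condition output of the intrinsic must have exactly one non-debug use,
// which must be a G_BRCOND in the same block; if that brcond is followed by an
// unconditional G_BR, the G_BR is returned through \p Br.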
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineInstr *&Br) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  if (UseMI.getParent() != MI.getParent() ||
      UseMI.getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR.
  MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
  if (Next != MI.getParent()->end()) {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
  }

  return &UseMI;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
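  // If the live-in virtual register has no defining instruction yet, emit the
  // copy from the physical argument register at the top of the entry block,
  // then restore the original insertion point.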
1986 if (!MRI.getVRegDef(LiveIn)) { 1987 // FIXME: Should have scoped insert pt 1988 MachineBasicBlock &OrigInsBB = B.getMBB(); 1989 auto OrigInsPt = B.getInsertPt(); 1990 1991 MachineBasicBlock &EntryMBB = B.getMF().front(); 1992 EntryMBB.addLiveIn(Arg->getRegister()); 1993 B.setInsertPt(EntryMBB, EntryMBB.begin()); 1994 B.buildCopy(LiveIn, Arg->getRegister()); 1995 1996 B.setInsertPt(OrigInsBB, OrigInsPt); 1997 } 1998 1999 return true; 2000 } 2001 2002 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2003 MachineInstr &MI, 2004 MachineRegisterInfo &MRI, 2005 MachineIRBuilder &B, 2006 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2007 B.setInstr(MI); 2008 2009 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2010 2011 const ArgDescriptor *Arg; 2012 const TargetRegisterClass *RC; 2013 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2014 if (!Arg) { 2015 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2016 return false; 2017 } 2018 2019 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 2020 MI.eraseFromParent(); 2021 return true; 2022 } 2023 2024 return false; 2025 } 2026 2027 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2028 MachineRegisterInfo &MRI, 2029 MachineIRBuilder &B) const { 2030 B.setInstr(MI); 2031 Register Dst = MI.getOperand(0).getReg(); 2032 LLT DstTy = MRI.getType(Dst); 2033 LLT S16 = LLT::scalar(16); 2034 LLT S32 = LLT::scalar(32); 2035 LLT S64 = LLT::scalar(64); 2036 2037 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2038 return true; 2039 2040 if (DstTy == S16) 2041 return legalizeFDIV16(MI, MRI, B); 2042 if (DstTy == S32) 2043 return legalizeFDIV32(MI, MRI, B); 2044 if (DstTy == S64) 2045 return legalizeFDIV64(MI, MRI, B); 2046 2047 return false; 2048 } 2049 2050 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2051 MachineRegisterInfo &MRI, 2052 MachineIRBuilder &B) const { 2053 Register Res = MI.getOperand(0).getReg(); 2054 Register LHS = MI.getOperand(1).getReg(); 2055 Register RHS = MI.getOperand(2).getReg(); 2056 2057 uint16_t Flags = MI.getFlags(); 2058 2059 LLT ResTy = MRI.getType(Res); 2060 LLT S32 = LLT::scalar(32); 2061 LLT S64 = LLT::scalar(64); 2062 2063 const MachineFunction &MF = B.getMF(); 2064 bool Unsafe = 2065 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2066 2067 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2068 return false; 2069 2070 if (!Unsafe && ResTy == S32 && 2071 MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals) 2072 return false; 2073 2074 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2075 // 1 / x -> RCP(x) 2076 if (CLHS->isExactlyValue(1.0)) { 2077 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2078 .addUse(RHS) 2079 .setMIFlags(Flags); 2080 2081 MI.eraseFromParent(); 2082 return true; 2083 } 2084 2085 // -1 / x -> RCP( FNEG(x) ) 2086 if (CLHS->isExactlyValue(-1.0)) { 2087 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2088 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2089 .addUse(FNeg.getReg(0)) 2090 .setMIFlags(Flags); 2091 2092 MI.eraseFromParent(); 2093 return true; 2094 } 2095 } 2096 2097 // x / y -> x * (1.0 / y) 2098 if (Unsafe) { 2099 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2100 .addUse(RHS) 2101 .setMIFlags(Flags); 2102 B.buildFMul(Res, LHS, RCP, Flags); 2103 2104 MI.eraseFromParent(); 2105 return true; 2106 } 2107 2108 return false; 2109 } 2110 2111 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2112 MachineRegisterInfo &MRI, 2113 
MachineIRBuilder &B) const { 2114 B.setInstr(MI); 2115 Register Res = MI.getOperand(0).getReg(); 2116 Register LHS = MI.getOperand(1).getReg(); 2117 Register RHS = MI.getOperand(2).getReg(); 2118 2119 uint16_t Flags = MI.getFlags(); 2120 2121 LLT S16 = LLT::scalar(16); 2122 LLT S32 = LLT::scalar(32); 2123 2124 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2125 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2126 2127 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2128 .addUse(RHSExt.getReg(0)) 2129 .setMIFlags(Flags); 2130 2131 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2132 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2133 2134 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2135 .addUse(RDst.getReg(0)) 2136 .addUse(RHS) 2137 .addUse(LHS) 2138 .setMIFlags(Flags); 2139 2140 MI.eraseFromParent(); 2141 return true; 2142 } 2143 2144 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2145 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2146 static void toggleSPDenormMode(bool Enable, 2147 MachineIRBuilder &B, 2148 const GCNSubtarget &ST, 2149 AMDGPU::SIModeRegisterDefaults Mode) { 2150 // Set SP denorm mode to this value. 2151 unsigned SPDenormMode = 2152 Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; 2153 2154 if (ST.hasDenormModeInst()) { 2155 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2156 unsigned DPDenormModeDefault = Mode.FP64FP16Denormals 2157 ? FP_DENORM_FLUSH_NONE 2158 : FP_DENORM_FLUSH_IN_FLUSH_OUT; 2159 2160 unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2161 B.buildInstr(AMDGPU::S_DENORM_MODE) 2162 .addImm(NewDenormModeValue); 2163 2164 } else { 2165 // Select FP32 bit field in mode register. 2166 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2167 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2168 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2169 2170 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2171 .addImm(SPDenormMode) 2172 .addImm(SPDenormModeBitField); 2173 } 2174 } 2175 2176 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2177 MachineRegisterInfo &MRI, 2178 MachineIRBuilder &B) const { 2179 B.setInstr(MI); 2180 Register Res = MI.getOperand(0).getReg(); 2181 Register LHS = MI.getOperand(1).getReg(); 2182 Register RHS = MI.getOperand(2).getReg(); 2183 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2184 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2185 2186 uint16_t Flags = MI.getFlags(); 2187 2188 LLT S32 = LLT::scalar(32); 2189 LLT S1 = LLT::scalar(1); 2190 2191 auto One = B.buildFConstant(S32, 1.0f); 2192 2193 auto DenominatorScaled = 2194 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2195 .addUse(RHS) 2196 .addUse(LHS) 2197 .addImm(1) 2198 .setMIFlags(Flags); 2199 auto NumeratorScaled = 2200 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2201 .addUse(LHS) 2202 .addUse(RHS) 2203 .addImm(0) 2204 .setMIFlags(Flags); 2205 2206 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2207 .addUse(DenominatorScaled.getReg(0)) 2208 .setMIFlags(Flags); 2209 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2210 2211 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2212 // aren't modeled as reading it. 
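  // The refinement sequence below can produce denormal intermediate values, so
  // when the function's default FP32 mode flushes denormals, temporarily
  // enable them around the FMA chain and restore the mode afterwards.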
  if (!Mode.FP32Denormals)
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.FP32Denormals)
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.
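    // Reconstruct the condition instead: compare the high 32 bits of each
    // source against the high 32 bits of the corresponding div_scale result,
    // and XOR the two comparisons to recover the scale condition.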
2288 2289 Scale = MRI.createGenericVirtualRegister(S1); 2290 2291 LLT S32 = LLT::scalar(32); 2292 2293 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2294 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2295 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2296 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2297 2298 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2299 Scale1Unmerge.getReg(1)); 2300 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2301 Scale0Unmerge.getReg(1)); 2302 B.buildXor(Scale, CmpNum, CmpDen); 2303 } else { 2304 Scale = DivScale1.getReg(1); 2305 } 2306 2307 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2308 .addUse(Fma4.getReg(0)) 2309 .addUse(Fma3.getReg(0)) 2310 .addUse(Mul.getReg(0)) 2311 .addUse(Scale) 2312 .setMIFlags(Flags); 2313 2314 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2315 .addUse(Fmas.getReg(0)) 2316 .addUse(RHS) 2317 .addUse(LHS) 2318 .setMIFlags(Flags); 2319 2320 MI.eraseFromParent(); 2321 return true; 2322 } 2323 2324 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2325 MachineRegisterInfo &MRI, 2326 MachineIRBuilder &B) const { 2327 B.setInstr(MI); 2328 Register Res = MI.getOperand(0).getReg(); 2329 Register LHS = MI.getOperand(2).getReg(); 2330 Register RHS = MI.getOperand(3).getReg(); 2331 uint16_t Flags = MI.getFlags(); 2332 2333 LLT S32 = LLT::scalar(32); 2334 LLT S1 = LLT::scalar(1); 2335 2336 auto Abs = B.buildFAbs(S32, RHS, Flags); 2337 const APFloat C0Val(1.0f); 2338 2339 auto C0 = B.buildConstant(S32, 0x6f800000); 2340 auto C1 = B.buildConstant(S32, 0x2f800000); 2341 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2342 2343 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2344 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2345 2346 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2347 2348 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2349 .addUse(Mul0.getReg(0)) 2350 .setMIFlags(Flags); 2351 2352 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2353 2354 B.buildFMul(Res, Sel, Mul1, Flags); 2355 2356 MI.eraseFromParent(); 2357 return true; 2358 } 2359 2360 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2361 MachineRegisterInfo &MRI, 2362 MachineIRBuilder &B) const { 2363 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2364 if (!MFI->isEntryFunction()) { 2365 return legalizePreloadedArgIntrin(MI, MRI, B, 2366 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2367 } 2368 2369 B.setInstr(MI); 2370 2371 uint64_t Offset = 2372 ST.getTargetLowering()->getImplicitParameterOffset( 2373 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2374 Register DstReg = MI.getOperand(0).getReg(); 2375 LLT DstTy = MRI.getType(DstReg); 2376 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2377 2378 const ArgDescriptor *Arg; 2379 const TargetRegisterClass *RC; 2380 std::tie(Arg, RC) 2381 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2382 if (!Arg) 2383 return false; 2384 2385 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2386 if (!loadInputValue(KernargPtrReg, B, Arg)) 2387 return false; 2388 2389 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2390 MI.eraseFromParent(); 2391 return true; 2392 } 2393 2394 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2395 MachineRegisterInfo &MRI, 2396 MachineIRBuilder &B, 2397 unsigned AddrSpace) const { 2398 B.setInstr(MI); 2399 
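  // A flat pointer belongs to the queried segment when its high 32 bits match
  // the aperture base for that address space, so compare the pointer's upper
  // half against the aperture register.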
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 2400 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2401 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2402 MI.eraseFromParent(); 2403 return true; 2404 } 2405 2406 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2407 // offset (the offset that is included in bounds checking and swizzling, to be 2408 // split between the instruction's voffset and immoffset fields) and soffset 2409 // (the offset that is excluded from bounds checking and swizzling, to go in 2410 // the instruction's soffset field). This function takes the first kind of 2411 // offset and figures out how to split it between voffset and immoffset. 2412 std::tuple<Register, unsigned, unsigned> 2413 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2414 Register OrigOffset) const { 2415 const unsigned MaxImm = 4095; 2416 Register BaseReg; 2417 unsigned TotalConstOffset; 2418 MachineInstr *OffsetDef; 2419 const LLT S32 = LLT::scalar(32); 2420 2421 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2422 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2423 2424 unsigned ImmOffset = TotalConstOffset; 2425 2426 // If the immediate value is too big for the immoffset field, put the value 2427 // and -4096 into the immoffset field so that the value that is copied/added 2428 // for the voffset field is a multiple of 4096, and it stands more chance 2429 // of being CSEd with the copy/add for another similar load/store. 2430 // However, do not do that rounding down to a multiple of 4096 if that is a 2431 // negative number, as it appears to be illegal to have a negative offset 2432 // in the vgpr, even if adding the immediate offset makes it positive. 2433 unsigned Overflow = ImmOffset & ~MaxImm; 2434 ImmOffset -= Overflow; 2435 if ((int32_t)Overflow < 0) { 2436 Overflow += ImmOffset; 2437 ImmOffset = 0; 2438 } 2439 2440 if (Overflow != 0) { 2441 if (!BaseReg) { 2442 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2443 } else { 2444 auto OverflowVal = B.buildConstant(S32, Overflow); 2445 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2446 } 2447 } 2448 2449 if (!BaseReg) 2450 BaseReg = B.buildConstant(S32, 0).getReg(0); 2451 2452 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2453 } 2454 2455 /// Handle register layout difference for f16 images for some subtargets. 2456 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2457 MachineRegisterInfo &MRI, 2458 Register Reg) const { 2459 if (!ST.hasUnpackedD16VMem()) 2460 return Reg; 2461 2462 const LLT S16 = LLT::scalar(16); 2463 const LLT S32 = LLT::scalar(32); 2464 LLT StoreVT = MRI.getType(Reg); 2465 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2466 2467 auto Unmerge = B.buildUnmerge(S16, Reg); 2468 2469 SmallVector<Register, 4> WideRegs; 2470 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2471 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2472 2473 int NumElts = StoreVT.getNumElements(); 2474 2475 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2476 } 2477 2478 Register AMDGPULegalizerInfo::fixStoreSourceType( 2479 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2480 MachineRegisterInfo *MRI = B.getMRI(); 2481 LLT Ty = MRI->getType(VData); 2482 2483 const LLT S16 = LLT::scalar(16); 2484 2485 // Fixup illegal register types for i8 stores. 
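  // Sub-dword scalar sources are any-extended to 32 bits; small f16 vector
  // sources for format stores may instead need the unpacked D16 layout below.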
2486 if (Ty == LLT::scalar(8) || Ty == S16) { 2487 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2488 return AnyExt; 2489 } 2490 2491 if (Ty.isVector()) { 2492 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2493 if (IsFormat) 2494 return handleD16VData(B, *MRI, VData); 2495 } 2496 } 2497 2498 return VData; 2499 } 2500 2501 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2502 MachineRegisterInfo &MRI, 2503 MachineIRBuilder &B, 2504 bool IsTyped, 2505 bool IsFormat) const { 2506 B.setInstr(MI); 2507 2508 Register VData = MI.getOperand(1).getReg(); 2509 LLT Ty = MRI.getType(VData); 2510 LLT EltTy = Ty.getScalarType(); 2511 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2512 const LLT S32 = LLT::scalar(32); 2513 2514 VData = fixStoreSourceType(B, VData, IsFormat); 2515 Register RSrc = MI.getOperand(2).getReg(); 2516 2517 MachineMemOperand *MMO = *MI.memoperands_begin(); 2518 const int MemSize = MMO->getSize(); 2519 2520 unsigned ImmOffset; 2521 unsigned TotalOffset; 2522 2523 // The typed intrinsics add an immediate after the registers. 2524 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2525 2526 // The struct intrinsic variants add one additional operand over raw. 2527 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2528 Register VIndex; 2529 int OpOffset = 0; 2530 if (HasVIndex) { 2531 VIndex = MI.getOperand(3).getReg(); 2532 OpOffset = 1; 2533 } 2534 2535 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2536 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2537 2538 unsigned Format = 0; 2539 if (IsTyped) { 2540 Format = MI.getOperand(5 + OpOffset).getImm(); 2541 ++OpOffset; 2542 } 2543 2544 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2545 2546 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2547 if (TotalOffset != 0) 2548 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2549 2550 unsigned Opc; 2551 if (IsTyped) { 2552 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 2553 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 2554 } else if (IsFormat) { 2555 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 2556 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 2557 } else { 2558 switch (MemSize) { 2559 case 1: 2560 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 2561 break; 2562 case 2: 2563 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 2564 break; 2565 default: 2566 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 2567 break; 2568 } 2569 } 2570 2571 if (!VIndex) 2572 VIndex = B.buildConstant(S32, 0).getReg(0); 2573 2574 auto MIB = B.buildInstr(Opc) 2575 .addUse(VData) // vdata 2576 .addUse(RSrc) // rsrc 2577 .addUse(VIndex) // vindex 2578 .addUse(VOffset) // voffset 2579 .addUse(SOffset) // soffset 2580 .addImm(ImmOffset); // offset(imm) 2581 2582 if (IsTyped) 2583 MIB.addImm(Format); 2584 2585 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2586 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2587 .addMemOperand(MMO); 2588 2589 MI.eraseFromParent(); 2590 return true; 2591 } 2592 2593 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 2594 MachineRegisterInfo &MRI, 2595 MachineIRBuilder &B, 2596 bool IsFormat, 2597 bool IsTyped) const { 2598 B.setInstr(MI); 2599 2600 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg) // vdata
    .addUse(RSrc)       // rsrc
    .addUse(VIndex)     // vindex
    .addUse(VOffset)    // voffset
    .addUse(SOffset)    // soffset
    .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was widened for an extending load; narrow it back down.
2698 if (IsExtLoad) 2699 B.buildTrunc(Dst, LoadDstReg); 2700 else { 2701 // Repack to original 16-bit vector result 2702 // FIXME: G_TRUNC should work, but legalization currently fails 2703 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 2704 SmallVector<Register, 4> Repack; 2705 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 2706 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 2707 B.buildMerge(Dst, Repack); 2708 } 2709 } 2710 2711 MI.eraseFromParent(); 2712 return true; 2713 } 2714 2715 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 2716 MachineIRBuilder &B, 2717 bool IsInc) const { 2718 B.setInstr(MI); 2719 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 2720 AMDGPU::G_AMDGPU_ATOMIC_DEC; 2721 B.buildInstr(Opc) 2722 .addDef(MI.getOperand(0).getReg()) 2723 .addUse(MI.getOperand(2).getReg()) 2724 .addUse(MI.getOperand(3).getReg()) 2725 .cloneMemRefs(MI); 2726 MI.eraseFromParent(); 2727 return true; 2728 } 2729 2730 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 2731 switch (IntrID) { 2732 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 2733 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 2734 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 2735 case Intrinsic::amdgcn_raw_buffer_atomic_add: 2736 case Intrinsic::amdgcn_struct_buffer_atomic_add: 2737 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 2738 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 2739 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 2740 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 2741 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 2742 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 2743 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 2744 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 2745 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 2746 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 2747 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 2748 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 2749 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 2750 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 2751 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 2752 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 2753 case Intrinsic::amdgcn_raw_buffer_atomic_and: 2754 case Intrinsic::amdgcn_struct_buffer_atomic_and: 2755 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 2756 case Intrinsic::amdgcn_raw_buffer_atomic_or: 2757 case Intrinsic::amdgcn_struct_buffer_atomic_or: 2758 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 2759 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 2760 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 2761 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 2762 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 2763 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 2764 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 2765 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 2766 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 2767 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 2768 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 2769 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 2770 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 2771 default: 2772 llvm_unreachable("unhandled atomic opcode"); 2773 } 2774 } 2775 2776 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 2777 MachineIRBuilder &B, 2778 Intrinsic::ID IID) const { 2779 B.setInstr(MI); 2780 2781 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 2782 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 2783 2784 Register Dst = MI.getOperand(0).getReg(); 2785 Register VData = 
      MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
  // We only need to process the operands of d16 image operations on subtargets
  // that use the unpacked register layout.
  if (!ST.hasUnpackedD16VMem())
    return true;

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  if (BaseOpcode->Atomic) // No d16 atomics
    return true;

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  if (BaseOpcode->Store) {
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    B.setInstr(MI);

    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
    Observer.changedInstr(MI);
    return true;
  }

  // Must be an image load.
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  if (!Ty.isVector() || Ty.getElementType() != S16)
    return true;

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  LLT WidenedTy = Ty.changeElementType(S32);
  Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);

  Observer.changingInstr(MI);
  MI.getOperand(0).setReg(WideDstReg);
  Observer.changedInstr(MI);

  // FIXME: Just vector trunc should be sufficient, but legalization currently
  // broken.
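  // Repack manually: split the widened result into 32-bit pieces, truncate
  // each piece back to 16 bits, and rebuild the original vector.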
2891 auto Unmerge = B.buildUnmerge(S32, WideDstReg); 2892 2893 int NumOps = Unmerge->getNumOperands() - 1; 2894 SmallVector<Register, 4> RemergeParts(NumOps); 2895 for (int I = 0; I != NumOps; ++I) 2896 RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0); 2897 2898 B.buildBuildVector(DstReg, RemergeParts); 2899 return true; 2900 } 2901 2902 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 2903 MachineIRBuilder &B, 2904 GISelChangeObserver &Observer) const { 2905 MachineRegisterInfo &MRI = *B.getMRI(); 2906 2907 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 2908 auto IntrID = MI.getIntrinsicID(); 2909 switch (IntrID) { 2910 case Intrinsic::amdgcn_if: 2911 case Intrinsic::amdgcn_else: { 2912 MachineInstr *Br = nullptr; 2913 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 2914 const SIRegisterInfo *TRI 2915 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 2916 2917 B.setInstr(*BrCond); 2918 Register Def = MI.getOperand(1).getReg(); 2919 Register Use = MI.getOperand(3).getReg(); 2920 2921 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 2922 if (Br) 2923 BrTarget = Br->getOperand(0).getMBB(); 2924 2925 if (IntrID == Intrinsic::amdgcn_if) { 2926 B.buildInstr(AMDGPU::SI_IF) 2927 .addDef(Def) 2928 .addUse(Use) 2929 .addMBB(BrTarget); 2930 } else { 2931 B.buildInstr(AMDGPU::SI_ELSE) 2932 .addDef(Def) 2933 .addUse(Use) 2934 .addMBB(BrTarget) 2935 .addImm(0); 2936 } 2937 2938 if (Br) 2939 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 2940 2941 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 2942 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 2943 MI.eraseFromParent(); 2944 BrCond->eraseFromParent(); 2945 return true; 2946 } 2947 2948 return false; 2949 } 2950 case Intrinsic::amdgcn_loop: { 2951 MachineInstr *Br = nullptr; 2952 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 2953 const SIRegisterInfo *TRI 2954 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 2955 2956 B.setInstr(*BrCond); 2957 2958 // FIXME: Need to adjust branch targets based on unconditional branch. 
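      // Rewrite the G_BRCOND use into SI_LOOP, giving it the loop mask
      // register and the conditional branch's target block, and constrain the
      // mask to the wave mask register class.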
2959 Register Reg = MI.getOperand(2).getReg(); 2960 B.buildInstr(AMDGPU::SI_LOOP) 2961 .addUse(Reg) 2962 .addMBB(BrCond->getOperand(1).getMBB()); 2963 MI.eraseFromParent(); 2964 BrCond->eraseFromParent(); 2965 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 2966 return true; 2967 } 2968 2969 return false; 2970 } 2971 case Intrinsic::amdgcn_kernarg_segment_ptr: 2972 return legalizePreloadedArgIntrin( 2973 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2974 case Intrinsic::amdgcn_implicitarg_ptr: 2975 return legalizeImplicitArgPtr(MI, MRI, B); 2976 case Intrinsic::amdgcn_workitem_id_x: 2977 return legalizePreloadedArgIntrin(MI, MRI, B, 2978 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 2979 case Intrinsic::amdgcn_workitem_id_y: 2980 return legalizePreloadedArgIntrin(MI, MRI, B, 2981 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 2982 case Intrinsic::amdgcn_workitem_id_z: 2983 return legalizePreloadedArgIntrin(MI, MRI, B, 2984 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 2985 case Intrinsic::amdgcn_workgroup_id_x: 2986 return legalizePreloadedArgIntrin(MI, MRI, B, 2987 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 2988 case Intrinsic::amdgcn_workgroup_id_y: 2989 return legalizePreloadedArgIntrin(MI, MRI, B, 2990 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 2991 case Intrinsic::amdgcn_workgroup_id_z: 2992 return legalizePreloadedArgIntrin(MI, MRI, B, 2993 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 2994 case Intrinsic::amdgcn_dispatch_ptr: 2995 return legalizePreloadedArgIntrin(MI, MRI, B, 2996 AMDGPUFunctionArgInfo::DISPATCH_PTR); 2997 case Intrinsic::amdgcn_queue_ptr: 2998 return legalizePreloadedArgIntrin(MI, MRI, B, 2999 AMDGPUFunctionArgInfo::QUEUE_PTR); 3000 case Intrinsic::amdgcn_implicit_buffer_ptr: 3001 return legalizePreloadedArgIntrin( 3002 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 3003 case Intrinsic::amdgcn_dispatch_id: 3004 return legalizePreloadedArgIntrin(MI, MRI, B, 3005 AMDGPUFunctionArgInfo::DISPATCH_ID); 3006 case Intrinsic::amdgcn_fdiv_fast: 3007 return legalizeFDIVFastIntrin(MI, MRI, B); 3008 case Intrinsic::amdgcn_is_shared: 3009 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 3010 case Intrinsic::amdgcn_is_private: 3011 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 3012 case Intrinsic::amdgcn_wavefrontsize: { 3013 B.setInstr(MI); 3014 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 3015 MI.eraseFromParent(); 3016 return true; 3017 } 3018 case Intrinsic::amdgcn_raw_buffer_store: 3019 case Intrinsic::amdgcn_struct_buffer_store: 3020 return legalizeBufferStore(MI, MRI, B, false, false); 3021 case Intrinsic::amdgcn_raw_buffer_store_format: 3022 case Intrinsic::amdgcn_struct_buffer_store_format: 3023 return legalizeBufferStore(MI, MRI, B, false, true); 3024 case Intrinsic::amdgcn_raw_tbuffer_store: 3025 case Intrinsic::amdgcn_struct_tbuffer_store: 3026 return legalizeBufferStore(MI, MRI, B, true, true); 3027 case Intrinsic::amdgcn_raw_buffer_load: 3028 case Intrinsic::amdgcn_struct_buffer_load: 3029 return legalizeBufferLoad(MI, MRI, B, false, false); 3030 case Intrinsic::amdgcn_raw_buffer_load_format: 3031 case Intrinsic::amdgcn_struct_buffer_load_format: 3032 return legalizeBufferLoad(MI, MRI, B, true, false); 3033 case Intrinsic::amdgcn_raw_tbuffer_load: 3034 case Intrinsic::amdgcn_struct_tbuffer_load: 3035 return legalizeBufferLoad(MI, MRI, B, true, true); 3036 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3037 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3038 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3039 case 
Intrinsic::amdgcn_struct_buffer_atomic_add: 3040 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3041 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3042 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3043 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3044 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3045 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3046 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3047 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3048 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3049 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3050 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3051 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3052 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3053 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3054 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3055 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3056 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3057 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3058 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3059 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3060 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3061 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3062 return legalizeBufferAtomic(MI, B, IntrID); 3063 case Intrinsic::amdgcn_atomic_inc: 3064 return legalizeAtomicIncDec(MI, B, true); 3065 case Intrinsic::amdgcn_atomic_dec: 3066 return legalizeAtomicIncDec(MI, B, false); 3067 default: { 3068 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 3069 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 3070 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 3071 return true; 3072 } 3073 } 3074 3075 return true; 3076 } 3077