//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx,
                          LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    LLT CoercedTy;
    if (Size < 32) {
      // <2 x s8> -> s16
      assert(Size == 16);
      CoercedTy = LLT::scalar(16);
    } else
      CoercedTy = LLT::scalarOrVector(Size / 32, 32);

    return std::make_pair(TypeIdx, CoercedTy);
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal?
// Most loads extend to 32-bits, but we handle some operations by just
// promoting the register during selection. There are also d16 loads on GFX9+
// which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned Align = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (Align < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
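// For example, a 96-bit <6 x s16> load is reported here as needing the
// workaround and is later bitcast to <3 x s32> by bitcastToRegisterType().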
302 static bool loadStoreBitcastWorkaround(const LLT Ty) { 303 if (EnableNewLegality) 304 return false; 305 306 const unsigned Size = Ty.getSizeInBits(); 307 if (Size <= 64) 308 return false; 309 if (!Ty.isVector()) 310 return true; 311 unsigned EltSize = Ty.getElementType().getSizeInBits(); 312 return EltSize != 32 && EltSize != 64; 313 } 314 315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, 316 unsigned Opcode) { 317 const LLT Ty = Query.Types[0]; 318 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) && 319 !loadStoreBitcastWorkaround(Ty); 320 } 321 322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 323 const GCNTargetMachine &TM) 324 : ST(ST_) { 325 using namespace TargetOpcode; 326 327 auto GetAddrSpacePtr = [&TM](unsigned AS) { 328 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 329 }; 330 331 const LLT S1 = LLT::scalar(1); 332 const LLT S16 = LLT::scalar(16); 333 const LLT S32 = LLT::scalar(32); 334 const LLT S64 = LLT::scalar(64); 335 const LLT S128 = LLT::scalar(128); 336 const LLT S256 = LLT::scalar(256); 337 const LLT S512 = LLT::scalar(512); 338 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 339 340 const LLT V2S16 = LLT::vector(2, 16); 341 const LLT V4S16 = LLT::vector(4, 16); 342 343 const LLT V2S32 = LLT::vector(2, 32); 344 const LLT V3S32 = LLT::vector(3, 32); 345 const LLT V4S32 = LLT::vector(4, 32); 346 const LLT V5S32 = LLT::vector(5, 32); 347 const LLT V6S32 = LLT::vector(6, 32); 348 const LLT V7S32 = LLT::vector(7, 32); 349 const LLT V8S32 = LLT::vector(8, 32); 350 const LLT V9S32 = LLT::vector(9, 32); 351 const LLT V10S32 = LLT::vector(10, 32); 352 const LLT V11S32 = LLT::vector(11, 32); 353 const LLT V12S32 = LLT::vector(12, 32); 354 const LLT V13S32 = LLT::vector(13, 32); 355 const LLT V14S32 = LLT::vector(14, 32); 356 const LLT V15S32 = LLT::vector(15, 32); 357 const LLT V16S32 = LLT::vector(16, 32); 358 const LLT V32S32 = LLT::vector(32, 32); 359 360 const LLT V2S64 = LLT::vector(2, 64); 361 const LLT V3S64 = LLT::vector(3, 64); 362 const LLT V4S64 = LLT::vector(4, 64); 363 const LLT V5S64 = LLT::vector(5, 64); 364 const LLT V6S64 = LLT::vector(6, 64); 365 const LLT V7S64 = LLT::vector(7, 64); 366 const LLT V8S64 = LLT::vector(8, 64); 367 const LLT V16S64 = LLT::vector(16, 64); 368 369 std::initializer_list<LLT> AllS32Vectors = 370 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 371 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 372 std::initializer_list<LLT> AllS64Vectors = 373 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 374 375 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 376 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 377 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 378 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 379 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 380 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 381 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 382 383 const LLT CodePtr = FlatPtr; 384 385 const std::initializer_list<LLT> AddrSpaces64 = { 386 GlobalPtr, ConstantPtr, FlatPtr 387 }; 388 389 const std::initializer_list<LLT> AddrSpaces32 = { 390 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 391 }; 392 393 const std::initializer_list<LLT> FPTypesBase = { 394 S32, S64 395 }; 396 397 const std::initializer_list<LLT> FPTypes16 = { 398 S32, S64, S16 399 }; 400 401 const 
std::initializer_list<LLT> FPTypesPK16 = { 402 S32, S64, S16, V2S16 403 }; 404 405 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 406 407 setAction({G_BRCOND, S1}, Legal); // VCC branches 408 setAction({G_BRCOND, S32}, Legal); // SCC branches 409 410 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 411 // elements for v3s16 412 getActionDefinitionsBuilder(G_PHI) 413 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 414 .legalFor(AllS32Vectors) 415 .legalFor(AllS64Vectors) 416 .legalFor(AddrSpaces64) 417 .legalFor(AddrSpaces32) 418 .clampScalar(0, S32, S256) 419 .widenScalarToNextPow2(0, 32) 420 .clampMaxNumElements(0, S32, 16) 421 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 422 .legalIf(isPointer(0)); 423 424 if (ST.hasVOP3PInsts()) { 425 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 426 .legalFor({S32, S16, V2S16}) 427 .clampScalar(0, S16, S32) 428 .clampMaxNumElements(0, S16, 2) 429 .scalarize(0) 430 .widenScalarToNextPow2(0, 32); 431 } else if (ST.has16BitInsts()) { 432 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 433 .legalFor({S32, S16}) 434 .clampScalar(0, S16, S32) 435 .scalarize(0) 436 .widenScalarToNextPow2(0, 32); 437 } else { 438 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 439 .legalFor({S32}) 440 .clampScalar(0, S32, S32) 441 .scalarize(0); 442 } 443 444 // FIXME: Not really legal. Placeholder for custom lowering. 445 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 446 .customFor({S32, S64}) 447 .clampScalar(0, S32, S64) 448 .widenScalarToNextPow2(0, 32) 449 .scalarize(0); 450 451 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 452 .legalFor({S32}) 453 .clampScalar(0, S32, S32) 454 .scalarize(0); 455 456 // Report legal for any types we can handle anywhere. For the cases only legal 457 // on the SALU, RegBankSelect will be able to re-legalize. 458 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 459 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 460 .clampScalar(0, S32, S64) 461 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 462 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 463 .widenScalarToNextPow2(0) 464 .scalarize(0); 465 466 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 467 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 468 .legalFor({{S32, S1}, {S32, S32}}) 469 .minScalar(0, S32) 470 // TODO: .scalarize(0) 471 .lower(); 472 473 getActionDefinitionsBuilder(G_BITCAST) 474 // Don't worry about the size constraint. 475 .legalIf(all(isRegisterType(0), isRegisterType(1))) 476 .lower(); 477 478 479 getActionDefinitionsBuilder(G_CONSTANT) 480 .legalFor({S1, S32, S64, S16, GlobalPtr, 481 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 482 .clampScalar(0, S32, S64) 483 .widenScalarToNextPow2(0) 484 .legalIf(isPointer(0)); 485 486 getActionDefinitionsBuilder(G_FCONSTANT) 487 .legalFor({S32, S64, S16}) 488 .clampScalar(0, S16, S64); 489 490 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 491 .legalIf(isRegisterType(0)) 492 // s1 and s16 are special cases because they have legal operations on 493 // them, but don't really occupy registers in the normal way. 494 .legalFor({S1, S16}) 495 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 496 .clampScalarOrElt(0, S32, MaxScalar) 497 .widenScalarToNextPow2(0, 32) 498 .clampMaxNumElements(0, S32, 16); 499 500 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 501 502 // If the amount is divergent, we have to do a wave reduction to get the 503 // maximum value, so this is expanded during RegBankSelect. 
504 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 505 .legalFor({{PrivatePtr, S32}}); 506 507 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 508 .unsupportedFor({PrivatePtr}) 509 .custom(); 510 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 511 512 auto &FPOpActions = getActionDefinitionsBuilder( 513 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 514 .legalFor({S32, S64}); 515 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 516 .customFor({S32, S64}); 517 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 518 .customFor({S32, S64}); 519 520 if (ST.has16BitInsts()) { 521 if (ST.hasVOP3PInsts()) 522 FPOpActions.legalFor({S16, V2S16}); 523 else 524 FPOpActions.legalFor({S16}); 525 526 TrigActions.customFor({S16}); 527 FDIVActions.customFor({S16}); 528 } 529 530 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 531 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 532 533 if (ST.hasVOP3PInsts()) { 534 MinNumMaxNum.customFor(FPTypesPK16) 535 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 536 .clampMaxNumElements(0, S16, 2) 537 .clampScalar(0, S16, S64) 538 .scalarize(0); 539 } else if (ST.has16BitInsts()) { 540 MinNumMaxNum.customFor(FPTypes16) 541 .clampScalar(0, S16, S64) 542 .scalarize(0); 543 } else { 544 MinNumMaxNum.customFor(FPTypesBase) 545 .clampScalar(0, S32, S64) 546 .scalarize(0); 547 } 548 549 if (ST.hasVOP3PInsts()) 550 FPOpActions.clampMaxNumElements(0, S16, 2); 551 552 FPOpActions 553 .scalarize(0) 554 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 555 556 TrigActions 557 .scalarize(0) 558 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 559 560 FDIVActions 561 .scalarize(0) 562 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 563 564 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 565 .legalFor(FPTypesPK16) 566 .clampMaxNumElements(0, S16, 2) 567 .scalarize(0) 568 .clampScalar(0, S16, S64); 569 570 if (ST.has16BitInsts()) { 571 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 572 .legalFor({S32, S64, S16}) 573 .scalarize(0) 574 .clampScalar(0, S16, S64); 575 } else { 576 getActionDefinitionsBuilder(G_FSQRT) 577 .legalFor({S32, S64}) 578 .scalarize(0) 579 .clampScalar(0, S32, S64); 580 581 if (ST.hasFractBug()) { 582 getActionDefinitionsBuilder(G_FFLOOR) 583 .customFor({S64}) 584 .legalFor({S32, S64}) 585 .scalarize(0) 586 .clampScalar(0, S32, S64); 587 } else { 588 getActionDefinitionsBuilder(G_FFLOOR) 589 .legalFor({S32, S64}) 590 .scalarize(0) 591 .clampScalar(0, S32, S64); 592 } 593 } 594 595 getActionDefinitionsBuilder(G_FPTRUNC) 596 .legalFor({{S32, S64}, {S16, S32}}) 597 .scalarize(0) 598 .lower(); 599 600 getActionDefinitionsBuilder(G_FPEXT) 601 .legalFor({{S64, S32}, {S32, S16}}) 602 .lowerFor({{S64, S16}}) // FIXME: Implement 603 .scalarize(0); 604 605 getActionDefinitionsBuilder(G_FSUB) 606 // Use actual fsub instruction 607 .legalFor({S32}) 608 // Must use fadd + fneg 609 .lowerFor({S64, S16, V2S16}) 610 .scalarize(0) 611 .clampScalar(0, S32, S64); 612 613 // Whether this is legal depends on the floating point mode for the function. 614 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 615 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 616 FMad.customFor({S32, S16}); 617 else if (ST.hasMadMacF32Insts()) 618 FMad.customFor({S32}); 619 else if (ST.hasMadF16()) 620 FMad.customFor({S16}); 621 FMad.scalarize(0) 622 .lower(); 623 624 // TODO: Do we need to clamp maximum bitwidth? 
625 getActionDefinitionsBuilder(G_TRUNC) 626 .legalIf(isScalar(0)) 627 .legalFor({{V2S16, V2S32}}) 628 .clampMaxNumElements(0, S16, 2) 629 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 630 // situations (like an invalid implicit use), we don't want to infinite loop 631 // in the legalizer. 632 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 633 .alwaysLegal(); 634 635 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 636 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 637 {S32, S1}, {S64, S1}, {S16, S1}}) 638 .scalarize(0) 639 .clampScalar(0, S32, S64) 640 .widenScalarToNextPow2(1, 32); 641 642 // TODO: Split s1->s64 during regbankselect for VALU. 643 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 644 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 645 .lowerFor({{S32, S64}}) 646 .lowerIf(typeIs(1, S1)) 647 .customFor({{S64, S64}}); 648 if (ST.has16BitInsts()) 649 IToFP.legalFor({{S16, S16}}); 650 IToFP.clampScalar(1, S32, S64) 651 .scalarize(0) 652 .widenScalarToNextPow2(1); 653 654 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 655 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 656 .customFor({{S64, S64}}); 657 if (ST.has16BitInsts()) 658 FPToI.legalFor({{S16, S16}}); 659 else 660 FPToI.minScalar(1, S32); 661 662 FPToI.minScalar(0, S32) 663 .scalarize(0) 664 .lower(); 665 666 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 667 .scalarize(0) 668 .lower(); 669 670 if (ST.has16BitInsts()) { 671 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 672 .legalFor({S16, S32, S64}) 673 .clampScalar(0, S16, S64) 674 .scalarize(0); 675 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 676 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 677 .legalFor({S32, S64}) 678 .clampScalar(0, S32, S64) 679 .scalarize(0); 680 } else { 681 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 682 .legalFor({S32}) 683 .customFor({S64}) 684 .clampScalar(0, S32, S64) 685 .scalarize(0); 686 } 687 688 // FIXME: Clamp offset operand. 689 getActionDefinitionsBuilder(G_PTR_ADD) 690 .legalIf(isPointer(0)) 691 .scalarize(0); 692 693 getActionDefinitionsBuilder(G_PTRMASK) 694 .legalIf(typeInSet(1, {S64, S32})) 695 .minScalar(1, S32) 696 .maxScalarIf(sizeIs(0, 32), 1, S32) 697 .maxScalarIf(sizeIs(0, 64), 1, S64) 698 .scalarize(0); 699 700 auto &CmpBuilder = 701 getActionDefinitionsBuilder(G_ICMP) 702 // The compare output type differs based on the register bank of the output, 703 // so make both s1 and s32 legal. 704 // 705 // Scalar compares producing output in scc will be promoted to s32, as that 706 // is the allocatable register type that will be needed for the copy from 707 // scc. This will be promoted during RegBankSelect, and we assume something 708 // before that won't try to use s32 result types. 709 // 710 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 711 // bank. 712 .legalForCartesianProduct( 713 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 714 .legalForCartesianProduct( 715 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 716 if (ST.has16BitInsts()) { 717 CmpBuilder.legalFor({{S1, S16}}); 718 } 719 720 CmpBuilder 721 .widenScalarToNextPow2(1) 722 .clampScalar(1, S32, S64) 723 .scalarize(0) 724 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 725 726 getActionDefinitionsBuilder(G_FCMP) 727 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? 
FPTypes16 : FPTypesBase) 728 .widenScalarToNextPow2(1) 729 .clampScalar(1, S32, S64) 730 .scalarize(0); 731 732 // FIXME: fpow has a selection pattern that should move to custom lowering. 733 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 734 if (ST.has16BitInsts()) 735 Exp2Ops.legalFor({S32, S16}); 736 else 737 Exp2Ops.legalFor({S32}); 738 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 739 Exp2Ops.scalarize(0); 740 741 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 742 if (ST.has16BitInsts()) 743 ExpOps.customFor({{S32}, {S16}}); 744 else 745 ExpOps.customFor({S32}); 746 ExpOps.clampScalar(0, MinScalarFPTy, S32) 747 .scalarize(0); 748 749 // The 64-bit versions produce 32-bit results, but only on the SALU. 750 getActionDefinitionsBuilder(G_CTPOP) 751 .legalFor({{S32, S32}, {S32, S64}}) 752 .clampScalar(0, S32, S32) 753 .clampScalar(1, S32, S64) 754 .scalarize(0) 755 .widenScalarToNextPow2(0, 32) 756 .widenScalarToNextPow2(1, 32); 757 758 // The hardware instructions return a different result on 0 than the generic 759 // instructions expect. The hardware produces -1, but these produce the 760 // bitwidth. 761 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 762 .scalarize(0) 763 .clampScalar(0, S32, S32) 764 .clampScalar(1, S32, S64) 765 .widenScalarToNextPow2(0, 32) 766 .widenScalarToNextPow2(1, 32) 767 .lower(); 768 769 // The 64-bit versions produce 32-bit results, but only on the SALU. 770 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 771 .legalFor({{S32, S32}, {S32, S64}}) 772 .clampScalar(0, S32, S32) 773 .clampScalar(1, S32, S64) 774 .scalarize(0) 775 .widenScalarToNextPow2(0, 32) 776 .widenScalarToNextPow2(1, 32); 777 778 getActionDefinitionsBuilder(G_BITREVERSE) 779 .legalFor({S32}) 780 .clampScalar(0, S32, S32) 781 .scalarize(0); 782 783 if (ST.has16BitInsts()) { 784 getActionDefinitionsBuilder(G_BSWAP) 785 .legalFor({S16, S32, V2S16}) 786 .clampMaxNumElements(0, S16, 2) 787 // FIXME: Fixing non-power-of-2 before clamp is workaround for 788 // narrowScalar limitation. 789 .widenScalarToNextPow2(0) 790 .clampScalar(0, S16, S32) 791 .scalarize(0); 792 793 if (ST.hasVOP3PInsts()) { 794 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 795 .legalFor({S32, S16, V2S16}) 796 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 797 .clampMaxNumElements(0, S16, 2) 798 .minScalar(0, S16) 799 .widenScalarToNextPow2(0) 800 .scalarize(0) 801 .lower(); 802 } else { 803 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 804 .legalFor({S32, S16}) 805 .widenScalarToNextPow2(0) 806 .minScalar(0, S16) 807 .scalarize(0) 808 .lower(); 809 } 810 } else { 811 // TODO: Should have same legality without v_perm_b32 812 getActionDefinitionsBuilder(G_BSWAP) 813 .legalFor({S32}) 814 .lowerIf(scalarNarrowerThan(0, 32)) 815 // FIXME: Fixing non-power-of-2 before clamp is workaround for 816 // narrowScalar limitation. 
817 .widenScalarToNextPow2(0) 818 .maxScalar(0, S32) 819 .scalarize(0) 820 .lower(); 821 822 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 823 .legalFor({S32}) 824 .minScalar(0, S32) 825 .widenScalarToNextPow2(0) 826 .scalarize(0) 827 .lower(); 828 } 829 830 getActionDefinitionsBuilder(G_INTTOPTR) 831 // List the common cases 832 .legalForCartesianProduct(AddrSpaces64, {S64}) 833 .legalForCartesianProduct(AddrSpaces32, {S32}) 834 .scalarize(0) 835 // Accept any address space as long as the size matches 836 .legalIf(sameSize(0, 1)) 837 .widenScalarIf(smallerThan(1, 0), 838 [](const LegalityQuery &Query) { 839 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 840 }) 841 .narrowScalarIf(largerThan(1, 0), 842 [](const LegalityQuery &Query) { 843 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 844 }); 845 846 getActionDefinitionsBuilder(G_PTRTOINT) 847 // List the common cases 848 .legalForCartesianProduct(AddrSpaces64, {S64}) 849 .legalForCartesianProduct(AddrSpaces32, {S32}) 850 .scalarize(0) 851 // Accept any address space as long as the size matches 852 .legalIf(sameSize(0, 1)) 853 .widenScalarIf(smallerThan(0, 1), 854 [](const LegalityQuery &Query) { 855 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 856 }) 857 .narrowScalarIf( 858 largerThan(0, 1), 859 [](const LegalityQuery &Query) { 860 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 861 }); 862 863 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 864 .scalarize(0) 865 .custom(); 866 867 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 868 bool IsLoad) -> bool { 869 const LLT DstTy = Query.Types[0]; 870 871 // Split vector extloads. 872 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 873 unsigned Align = Query.MMODescrs[0].AlignInBits; 874 875 if (MemSize < DstTy.getSizeInBits()) 876 MemSize = std::max(MemSize, Align); 877 878 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 879 return true; 880 881 const LLT PtrTy = Query.Types[1]; 882 unsigned AS = PtrTy.getAddressSpace(); 883 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 884 return true; 885 886 // Catch weird sized loads that don't evenly divide into the access sizes 887 // TODO: May be able to widen depending on alignment etc. 888 unsigned NumRegs = (MemSize + 31) / 32; 889 if (NumRegs == 3) { 890 if (!ST.hasDwordx3LoadStores()) 891 return true; 892 } else { 893 // If the alignment allows, these should have been widened. 894 if (!isPowerOf2_32(NumRegs)) 895 return true; 896 } 897 898 if (Align < MemSize) { 899 const SITargetLowering *TLI = ST.getTargetLowering(); 900 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 901 } 902 903 return false; 904 }; 905 906 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query, 907 unsigned Opc) -> bool { 908 unsigned Size = Query.Types[0].getSizeInBits(); 909 if (isPowerOf2_32(Size)) 910 return false; 911 912 if (Size == 96 && ST.hasDwordx3LoadStores()) 913 return false; 914 915 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 916 if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc)) 917 return false; 918 919 unsigned Align = Query.MMODescrs[0].AlignInBits; 920 unsigned RoundedSize = NextPowerOf2(Size); 921 return (Align >= RoundedSize); 922 }; 923 924 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 925 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 926 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 
0 : 8; 927 928 // TODO: Refine based on subtargets which support unaligned access or 128-bit 929 // LDS 930 // TODO: Unsupported flat for SI. 931 932 for (unsigned Op : {G_LOAD, G_STORE}) { 933 const bool IsStore = Op == G_STORE; 934 935 auto &Actions = getActionDefinitionsBuilder(Op); 936 // Explicitly list some common cases. 937 // TODO: Does this help compile time at all? 938 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 939 {V2S32, GlobalPtr, 64, GlobalAlign32}, 940 {V4S32, GlobalPtr, 128, GlobalAlign32}, 941 {S64, GlobalPtr, 64, GlobalAlign32}, 942 {V2S64, GlobalPtr, 128, GlobalAlign32}, 943 {V2S16, GlobalPtr, 32, GlobalAlign32}, 944 {S32, GlobalPtr, 8, GlobalAlign8}, 945 {S32, GlobalPtr, 16, GlobalAlign16}, 946 947 {S32, LocalPtr, 32, 32}, 948 {S64, LocalPtr, 64, 32}, 949 {V2S32, LocalPtr, 64, 32}, 950 {S32, LocalPtr, 8, 8}, 951 {S32, LocalPtr, 16, 16}, 952 {V2S16, LocalPtr, 32, 32}, 953 954 {S32, PrivatePtr, 32, 32}, 955 {S32, PrivatePtr, 8, 8}, 956 {S32, PrivatePtr, 16, 16}, 957 {V2S16, PrivatePtr, 32, 32}, 958 959 {S32, ConstantPtr, 32, GlobalAlign32}, 960 {V2S32, ConstantPtr, 64, GlobalAlign32}, 961 {V4S32, ConstantPtr, 128, GlobalAlign32}, 962 {S64, ConstantPtr, 64, GlobalAlign32}, 963 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 964 Actions.legalIf( 965 [=](const LegalityQuery &Query) -> bool { 966 return isLoadStoreLegal(ST, Query, Op); 967 }); 968 969 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 970 // 64-bits. 971 // 972 // TODO: Should generalize bitcast action into coerce, which will also cover 973 // inserting addrspacecasts. 974 Actions.customIf(typeIs(1, Constant32Ptr)); 975 976 // Turn any illegal element vectors into something easier to deal 977 // with. These will ultimately produce 32-bit scalar shifts to extract the 978 // parts anyway. 979 // 980 // For odd 16-bit element vectors, prefer to split those into pieces with 981 // 16-bit vector parts. 982 Actions.bitcastIf( 983 [=](const LegalityQuery &Query) -> bool { 984 const LLT Ty = Query.Types[0]; 985 986 // Do not cast an extload/truncstore. 987 if (Ty.getSizeInBits() != Query.MMODescrs[0].SizeInBits) 988 return false; 989 990 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 991 return true; 992 const unsigned Size = Ty.getSizeInBits(); 993 return Ty.isVector() && isRegisterSize(Size) && 994 !isRegisterVectorElementType(Ty.getElementType()); 995 }, bitcastToRegisterType(0)); 996 997 Actions 998 .customIf(typeIs(1, Constant32Ptr)) 999 // Widen suitably aligned loads by loading extra elements. 1000 .moreElementsIf([=](const LegalityQuery &Query) { 1001 const LLT Ty = Query.Types[0]; 1002 return Op == G_LOAD && Ty.isVector() && 1003 shouldWidenLoadResult(Query, Op); 1004 }, moreElementsToNextPow2(0)) 1005 .widenScalarIf([=](const LegalityQuery &Query) { 1006 const LLT Ty = Query.Types[0]; 1007 return Op == G_LOAD && !Ty.isVector() && 1008 shouldWidenLoadResult(Query, Op); 1009 }, widenScalarOrEltToNextPow2(0)) 1010 .narrowScalarIf( 1011 [=](const LegalityQuery &Query) -> bool { 1012 return !Query.Types[0].isVector() && 1013 needToSplitMemOp(Query, Op == G_LOAD); 1014 }, 1015 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1016 const LLT DstTy = Query.Types[0]; 1017 const LLT PtrTy = Query.Types[1]; 1018 1019 const unsigned DstSize = DstTy.getSizeInBits(); 1020 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1021 1022 // Split extloads. 
1023 if (DstSize > MemSize) 1024 return std::make_pair(0, LLT::scalar(MemSize)); 1025 1026 if (!isPowerOf2_32(DstSize)) { 1027 // We're probably decomposing an odd sized store. Try to split 1028 // to the widest type. TODO: Account for alignment. As-is it 1029 // should be OK, since the new parts will be further legalized. 1030 unsigned FloorSize = PowerOf2Floor(DstSize); 1031 return std::make_pair(0, LLT::scalar(FloorSize)); 1032 } 1033 1034 if (DstSize > 32 && (DstSize % 32 != 0)) { 1035 // FIXME: Need a way to specify non-extload of larger size if 1036 // suitably aligned. 1037 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1038 } 1039 1040 unsigned MaxSize = maxSizeForAddrSpace(ST, 1041 PtrTy.getAddressSpace(), 1042 Op == G_LOAD); 1043 if (MemSize > MaxSize) 1044 return std::make_pair(0, LLT::scalar(MaxSize)); 1045 1046 unsigned Align = Query.MMODescrs[0].AlignInBits; 1047 return std::make_pair(0, LLT::scalar(Align)); 1048 }) 1049 .fewerElementsIf( 1050 [=](const LegalityQuery &Query) -> bool { 1051 return Query.Types[0].isVector() && 1052 needToSplitMemOp(Query, Op == G_LOAD); 1053 }, 1054 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1055 const LLT DstTy = Query.Types[0]; 1056 const LLT PtrTy = Query.Types[1]; 1057 1058 LLT EltTy = DstTy.getElementType(); 1059 unsigned MaxSize = maxSizeForAddrSpace(ST, 1060 PtrTy.getAddressSpace(), 1061 Op == G_LOAD); 1062 1063 // FIXME: Handle widened to power of 2 results better. This ends 1064 // up scalarizing. 1065 // FIXME: 3 element stores scalarized on SI 1066 1067 // Split if it's too large for the address space. 1068 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1069 unsigned NumElts = DstTy.getNumElements(); 1070 unsigned EltSize = EltTy.getSizeInBits(); 1071 1072 if (MaxSize % EltSize == 0) { 1073 return std::make_pair( 1074 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1075 } 1076 1077 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1078 1079 // FIXME: Refine when odd breakdowns handled 1080 // The scalars will need to be re-legalized. 1081 if (NumPieces == 1 || NumPieces >= NumElts || 1082 NumElts % NumPieces != 0) 1083 return std::make_pair(0, EltTy); 1084 1085 return std::make_pair(0, 1086 LLT::vector(NumElts / NumPieces, EltTy)); 1087 } 1088 1089 // FIXME: We could probably handle weird extending loads better. 1090 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1091 if (DstTy.getSizeInBits() > MemSize) 1092 return std::make_pair(0, EltTy); 1093 1094 unsigned EltSize = EltTy.getSizeInBits(); 1095 unsigned DstSize = DstTy.getSizeInBits(); 1096 if (!isPowerOf2_32(DstSize)) { 1097 // We're probably decomposing an odd sized store. Try to split 1098 // to the widest type. TODO: Account for alignment. As-is it 1099 // should be OK, since the new parts will be further legalized. 1100 unsigned FloorSize = PowerOf2Floor(DstSize); 1101 return std::make_pair( 1102 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1103 } 1104 1105 // Need to split because of alignment. 1106 unsigned Align = Query.MMODescrs[0].AlignInBits; 1107 if (EltSize > Align && 1108 (EltSize / Align < DstTy.getNumElements())) { 1109 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1110 } 1111 1112 // May need relegalization for the scalars. 1113 return std::make_pair(0, EltTy); 1114 }) 1115 .minScalar(0, S32); 1116 1117 if (IsStore) 1118 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1119 1120 // TODO: Need a bitcast lower option? 
1121 Actions 1122 .widenScalarToNextPow2(0) 1123 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1124 } 1125 1126 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1127 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1128 {S32, GlobalPtr, 16, 2 * 8}, 1129 {S32, LocalPtr, 8, 8}, 1130 {S32, LocalPtr, 16, 16}, 1131 {S32, PrivatePtr, 8, 8}, 1132 {S32, PrivatePtr, 16, 16}, 1133 {S32, ConstantPtr, 8, 8}, 1134 {S32, ConstantPtr, 16, 2 * 8}}); 1135 if (ST.hasFlatAddressSpace()) { 1136 ExtLoads.legalForTypesWithMemDesc( 1137 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1138 } 1139 1140 ExtLoads.clampScalar(0, S32, S32) 1141 .widenScalarToNextPow2(0) 1142 .unsupportedIfMemSizeNotPow2() 1143 .lower(); 1144 1145 auto &Atomics = getActionDefinitionsBuilder( 1146 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1147 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1148 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1149 G_ATOMICRMW_UMIN}) 1150 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1151 {S64, GlobalPtr}, {S64, LocalPtr}}); 1152 if (ST.hasFlatAddressSpace()) { 1153 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1154 } 1155 1156 if (ST.hasLDSFPAtomics()) { 1157 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1158 .legalFor({{S32, LocalPtr}}); 1159 } 1160 1161 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1162 // demarshalling 1163 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1164 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1165 {S32, FlatPtr}, {S64, FlatPtr}}) 1166 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1167 {S32, RegionPtr}, {S64, RegionPtr}}); 1168 // TODO: Pointer types, any 32-bit or 64-bit vector 1169 1170 // Condition should be s32 for scalar, s1 for vector. 1171 getActionDefinitionsBuilder(G_SELECT) 1172 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1173 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1174 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1175 .clampScalar(0, S16, S64) 1176 .scalarize(1) 1177 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1178 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1179 .clampMaxNumElements(0, S32, 2) 1180 .clampMaxNumElements(0, LocalPtr, 2) 1181 .clampMaxNumElements(0, PrivatePtr, 2) 1182 .scalarize(0) 1183 .widenScalarToNextPow2(0) 1184 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1185 1186 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1187 // be more flexible with the shift amount type. 1188 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1189 .legalFor({{S32, S32}, {S64, S32}}); 1190 if (ST.has16BitInsts()) { 1191 if (ST.hasVOP3PInsts()) { 1192 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1193 .clampMaxNumElements(0, S16, 2); 1194 } else 1195 Shifts.legalFor({{S16, S16}}); 1196 1197 // TODO: Support 16-bit shift amounts for all types 1198 Shifts.widenScalarIf( 1199 [=](const LegalityQuery &Query) { 1200 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1201 // 32-bit amount. 
1202 const LLT ValTy = Query.Types[0]; 1203 const LLT AmountTy = Query.Types[1]; 1204 return ValTy.getSizeInBits() <= 16 && 1205 AmountTy.getSizeInBits() < 16; 1206 }, changeTo(1, S16)); 1207 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1208 Shifts.clampScalar(1, S32, S32); 1209 Shifts.clampScalar(0, S16, S64); 1210 Shifts.widenScalarToNextPow2(0, 16); 1211 } else { 1212 // Make sure we legalize the shift amount type first, as the general 1213 // expansion for the shifted type will produce much worse code if it hasn't 1214 // been truncated already. 1215 Shifts.clampScalar(1, S32, S32); 1216 Shifts.clampScalar(0, S32, S64); 1217 Shifts.widenScalarToNextPow2(0, 32); 1218 } 1219 Shifts.scalarize(0); 1220 1221 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1222 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1223 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1224 unsigned IdxTypeIdx = 2; 1225 1226 getActionDefinitionsBuilder(Op) 1227 .customIf([=](const LegalityQuery &Query) { 1228 const LLT EltTy = Query.Types[EltTypeIdx]; 1229 const LLT VecTy = Query.Types[VecTypeIdx]; 1230 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1231 return (EltTy.getSizeInBits() == 16 || 1232 EltTy.getSizeInBits() % 32 == 0) && 1233 VecTy.getSizeInBits() % 32 == 0 && 1234 VecTy.getSizeInBits() <= MaxRegisterSize && 1235 IdxTy.getSizeInBits() == 32; 1236 }) 1237 .clampScalar(EltTypeIdx, S32, S64) 1238 .clampScalar(VecTypeIdx, S32, S64) 1239 .clampScalar(IdxTypeIdx, S32, S32); 1240 } 1241 1242 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1243 .unsupportedIf([=](const LegalityQuery &Query) { 1244 const LLT &EltTy = Query.Types[1].getElementType(); 1245 return Query.Types[0] != EltTy; 1246 }); 1247 1248 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1249 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1250 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1251 1252 // FIXME: Doesn't handle extract of illegal sizes. 1253 getActionDefinitionsBuilder(Op) 1254 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1255 // FIXME: Multiples of 16 should not be legal. 
1256 .legalIf([=](const LegalityQuery &Query) { 1257 const LLT BigTy = Query.Types[BigTyIdx]; 1258 const LLT LitTy = Query.Types[LitTyIdx]; 1259 return (BigTy.getSizeInBits() % 32 == 0) && 1260 (LitTy.getSizeInBits() % 16 == 0); 1261 }) 1262 .widenScalarIf( 1263 [=](const LegalityQuery &Query) { 1264 const LLT BigTy = Query.Types[BigTyIdx]; 1265 return (BigTy.getScalarSizeInBits() < 16); 1266 }, 1267 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1268 .widenScalarIf( 1269 [=](const LegalityQuery &Query) { 1270 const LLT LitTy = Query.Types[LitTyIdx]; 1271 return (LitTy.getScalarSizeInBits() < 16); 1272 }, 1273 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1274 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1275 .widenScalarToNextPow2(BigTyIdx, 32); 1276 1277 } 1278 1279 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1280 .legalForCartesianProduct(AllS32Vectors, {S32}) 1281 .legalForCartesianProduct(AllS64Vectors, {S64}) 1282 .clampNumElements(0, V16S32, V32S32) 1283 .clampNumElements(0, V2S64, V16S64) 1284 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1285 1286 if (ST.hasScalarPackInsts()) { 1287 BuildVector 1288 // FIXME: Should probably widen s1 vectors straight to s32 1289 .minScalarOrElt(0, S16) 1290 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1291 .minScalar(1, S32); 1292 1293 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1294 .legalFor({V2S16, S32}) 1295 .lower(); 1296 BuildVector.minScalarOrElt(0, S32); 1297 } else { 1298 BuildVector.customFor({V2S16, S16}); 1299 BuildVector.minScalarOrElt(0, S32); 1300 1301 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1302 .customFor({V2S16, S32}) 1303 .lower(); 1304 } 1305 1306 BuildVector.legalIf(isRegisterType(0)); 1307 1308 // FIXME: Clamp maximum size 1309 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1310 .legalIf(isRegisterType(0)); 1311 1312 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1313 // pre-legalize. 1314 if (ST.hasVOP3PInsts()) { 1315 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1316 .customFor({V2S16, V2S16}) 1317 .lower(); 1318 } else 1319 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1320 1321 // Merge/Unmerge 1322 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1323 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1324 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1325 1326 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1327 const LLT Ty = Query.Types[TypeIdx]; 1328 if (Ty.isVector()) { 1329 const LLT &EltTy = Ty.getElementType(); 1330 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1331 return true; 1332 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1333 return true; 1334 } 1335 return false; 1336 }; 1337 1338 auto &Builder = getActionDefinitionsBuilder(Op) 1339 .lowerFor({{S16, V2S16}}) 1340 .lowerIf([=](const LegalityQuery &Query) { 1341 const LLT BigTy = Query.Types[BigTyIdx]; 1342 return BigTy.getSizeInBits() == 32; 1343 }) 1344 // Try to widen to s16 first for small types. 1345 // TODO: Only do this on targets with legal s16 shifts 1346 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1347 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1348 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1349 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1350 elementTypeIs(1, S16)), 1351 changeTo(1, V2S16)) 1352 // Clamp the little scalar to s8-s256 and make it a power of 2. 
      // It's not worth considering the multiples of 64 since 2*192 and 2*384
      // are not valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= MaxRegisterSize;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
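    // (G_SEXT_INREG is expanded to a shift pair, so doing that expansion at
    // s32 skips wrapping each shift in s16 <-> s32 conversions.)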
1428 SextInReg.lowerFor({{S32}, {S64}}); 1429 } 1430 1431 SextInReg 1432 .scalarize(0) 1433 .clampScalar(0, S32, S64) 1434 .lower(); 1435 1436 getActionDefinitionsBuilder(G_FSHR) 1437 .legalFor({{S32, S32}}) 1438 .scalarize(0) 1439 .lower(); 1440 1441 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1442 .legalFor({S64}); 1443 1444 getActionDefinitionsBuilder({ 1445 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1446 G_FCOPYSIGN, 1447 1448 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1449 G_READ_REGISTER, 1450 G_WRITE_REGISTER, 1451 1452 G_SADDO, G_SSUBO, 1453 1454 // TODO: Implement 1455 G_FMINIMUM, G_FMAXIMUM, 1456 G_FSHL 1457 }).lower(); 1458 1459 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1460 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1461 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1462 .unsupported(); 1463 1464 computeTables(); 1465 verify(*ST.getInstrInfo()); 1466 } 1467 1468 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1469 MachineInstr &MI) const { 1470 MachineIRBuilder &B = Helper.MIRBuilder; 1471 MachineRegisterInfo &MRI = *B.getMRI(); 1472 GISelChangeObserver &Observer = Helper.Observer; 1473 1474 switch (MI.getOpcode()) { 1475 case TargetOpcode::G_ADDRSPACE_CAST: 1476 return legalizeAddrSpaceCast(MI, MRI, B); 1477 case TargetOpcode::G_FRINT: 1478 return legalizeFrint(MI, MRI, B); 1479 case TargetOpcode::G_FCEIL: 1480 return legalizeFceil(MI, MRI, B); 1481 case TargetOpcode::G_INTRINSIC_TRUNC: 1482 return legalizeIntrinsicTrunc(MI, MRI, B); 1483 case TargetOpcode::G_SITOFP: 1484 return legalizeITOFP(MI, MRI, B, true); 1485 case TargetOpcode::G_UITOFP: 1486 return legalizeITOFP(MI, MRI, B, false); 1487 case TargetOpcode::G_FPTOSI: 1488 return legalizeFPTOI(MI, MRI, B, true); 1489 case TargetOpcode::G_FPTOUI: 1490 return legalizeFPTOI(MI, MRI, B, false); 1491 case TargetOpcode::G_FMINNUM: 1492 case TargetOpcode::G_FMAXNUM: 1493 case TargetOpcode::G_FMINNUM_IEEE: 1494 case TargetOpcode::G_FMAXNUM_IEEE: 1495 return legalizeMinNumMaxNum(Helper, MI); 1496 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1497 return legalizeExtractVectorElt(MI, MRI, B); 1498 case TargetOpcode::G_INSERT_VECTOR_ELT: 1499 return legalizeInsertVectorElt(MI, MRI, B); 1500 case TargetOpcode::G_SHUFFLE_VECTOR: 1501 return legalizeShuffleVector(MI, MRI, B); 1502 case TargetOpcode::G_FSIN: 1503 case TargetOpcode::G_FCOS: 1504 return legalizeSinCos(MI, MRI, B); 1505 case TargetOpcode::G_GLOBAL_VALUE: 1506 return legalizeGlobalValue(MI, MRI, B); 1507 case TargetOpcode::G_LOAD: 1508 return legalizeLoad(MI, MRI, B, Observer); 1509 case TargetOpcode::G_FMAD: 1510 return legalizeFMad(MI, MRI, B); 1511 case TargetOpcode::G_FDIV: 1512 return legalizeFDIV(MI, MRI, B); 1513 case TargetOpcode::G_UDIV: 1514 case TargetOpcode::G_UREM: 1515 return legalizeUDIV_UREM(MI, MRI, B); 1516 case TargetOpcode::G_SDIV: 1517 case TargetOpcode::G_SREM: 1518 return legalizeSDIV_SREM(MI, MRI, B); 1519 case TargetOpcode::G_ATOMIC_CMPXCHG: 1520 return legalizeAtomicCmpXChg(MI, MRI, B); 1521 case TargetOpcode::G_FLOG: 1522 return legalizeFlog(MI, B, numbers::ln2f); 1523 case TargetOpcode::G_FLOG10: 1524 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1525 case TargetOpcode::G_FEXP: 1526 return legalizeFExp(MI, B); 1527 case TargetOpcode::G_FPOW: 1528 return legalizeFPow(MI, B); 1529 case TargetOpcode::G_FFLOOR: 1530 return legalizeFFloor(MI, MRI, B); 1531 case TargetOpcode::G_BUILD_VECTOR: 1532 return legalizeBuildVector(MI, MRI, B); 1533 default: 1534 return false; 1535 } 1536 1537 
llvm_unreachable("expected switch to return"); 1538 } 1539 1540 Register AMDGPULegalizerInfo::getSegmentAperture( 1541 unsigned AS, 1542 MachineRegisterInfo &MRI, 1543 MachineIRBuilder &B) const { 1544 MachineFunction &MF = B.getMF(); 1545 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1546 const LLT S32 = LLT::scalar(32); 1547 1548 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1549 1550 if (ST.hasApertureRegs()) { 1551 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1552 // getreg. 1553 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1554 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1555 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1556 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1557 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1558 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1559 unsigned Encoding = 1560 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1561 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1562 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1563 1564 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1565 1566 B.buildInstr(AMDGPU::S_GETREG_B32) 1567 .addDef(GetReg) 1568 .addImm(Encoding); 1569 MRI.setType(GetReg, S32); 1570 1571 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1572 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1573 } 1574 1575 Register QueuePtr = MRI.createGenericVirtualRegister( 1576 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1577 1578 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1579 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1580 return Register(); 1581 1582 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1583 // private_segment_aperture_base_hi. 1584 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1585 1586 // TODO: can we be smarter about machine pointer info? 1587 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1588 MachineMemOperand *MMO = MF.getMachineMemOperand( 1589 PtrInfo, 1590 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1591 MachineMemOperand::MOInvariant, 1592 4, commonAlignment(Align(64), StructOffset)); 1593 1594 Register LoadAddr; 1595 1596 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1597 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1598 } 1599 1600 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1601 MachineInstr &MI, MachineRegisterInfo &MRI, 1602 MachineIRBuilder &B) const { 1603 MachineFunction &MF = B.getMF(); 1604 1605 const LLT S32 = LLT::scalar(32); 1606 Register Dst = MI.getOperand(0).getReg(); 1607 Register Src = MI.getOperand(1).getReg(); 1608 1609 LLT DstTy = MRI.getType(Dst); 1610 LLT SrcTy = MRI.getType(Src); 1611 unsigned DestAS = DstTy.getAddressSpace(); 1612 unsigned SrcAS = SrcTy.getAddressSpace(); 1613 1614 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1615 // vector element. 1616 assert(!DstTy.isVector()); 1617 1618 const AMDGPUTargetMachine &TM 1619 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1620 1621 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1622 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1623 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1624 return true; 1625 } 1626 1627 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1628 // Truncate. 
1629 B.buildExtract(Dst, Src, 0); 1630 MI.eraseFromParent(); 1631 return true; 1632 } 1633 1634 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1635 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1636 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1637 1638 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1639 // another. Merge operands are required to be the same type, but creating an 1640 // extra ptrtoint would be kind of pointless. 1641 auto HighAddr = B.buildConstant( 1642 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1643 B.buildMerge(Dst, {Src, HighAddr}); 1644 MI.eraseFromParent(); 1645 return true; 1646 } 1647 1648 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1649 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1650 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1651 unsigned NullVal = TM.getNullPointerValue(DestAS); 1652 1653 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1654 auto FlatNull = B.buildConstant(SrcTy, 0); 1655 1656 // Extract low 32-bits of the pointer. 1657 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1658 1659 auto CmpRes = 1660 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1661 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1662 1663 MI.eraseFromParent(); 1664 return true; 1665 } 1666 1667 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1668 return false; 1669 1670 if (!ST.hasFlatAddressSpace()) 1671 return false; 1672 1673 auto SegmentNull = 1674 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1675 auto FlatNull = 1676 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1677 1678 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1679 if (!ApertureReg.isValid()) 1680 return false; 1681 1682 auto CmpRes = 1683 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1684 1685 // Coerce the type of the low half of the result so we can use merge_values. 1686 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1687 1688 // TODO: Should we allow mismatched types but matching sizes in merges to 1689 // avoid the ptrtoint? 1690 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1691 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1692 1693 MI.eraseFromParent(); 1694 return true; 1695 } 1696 1697 bool AMDGPULegalizerInfo::legalizeFrint( 1698 MachineInstr &MI, MachineRegisterInfo &MRI, 1699 MachineIRBuilder &B) const { 1700 Register Src = MI.getOperand(1).getReg(); 1701 LLT Ty = MRI.getType(Src); 1702 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1703 1704 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1705 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1706 1707 auto C1 = B.buildFConstant(Ty, C1Val); 1708 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1709 1710 // TODO: Should this propagate fast-math-flags? 
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
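  // The sign bit is widened to 64 bits so it can stand in for +/-0.0 below.
  // Overall, the bit manipulation implements trunc(f64):
  //   exp < 0   -> |src| < 1.0, so the result is just the sign bit (+/-0.0).
  //   exp > 51  -> no fraction bits remain in the mantissa; src is already an
  //                integer and is passed through.
  //   otherwise -> clear the (52 - exp) mantissa bits below the binary point
  //                using the shifted FractMask.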
1795 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1796 1797 auto Shr = B.buildAShr(S64, FractMask, Exp); 1798 auto Not = B.buildNot(S64, Shr); 1799 auto Tmp0 = B.buildAnd(S64, Src, Not); 1800 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1801 1802 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1803 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1804 1805 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1806 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1807 return true; 1808 } 1809 1810 bool AMDGPULegalizerInfo::legalizeITOFP( 1811 MachineInstr &MI, MachineRegisterInfo &MRI, 1812 MachineIRBuilder &B, bool Signed) const { 1813 1814 Register Dst = MI.getOperand(0).getReg(); 1815 Register Src = MI.getOperand(1).getReg(); 1816 1817 const LLT S64 = LLT::scalar(64); 1818 const LLT S32 = LLT::scalar(32); 1819 1820 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1821 1822 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1823 1824 auto CvtHi = Signed ? 1825 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1826 B.buildUITOFP(S64, Unmerge.getReg(1)); 1827 1828 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1829 1830 auto ThirtyTwo = B.buildConstant(S32, 32); 1831 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1832 .addUse(CvtHi.getReg(0)) 1833 .addUse(ThirtyTwo.getReg(0)); 1834 1835 // TODO: Should this propagate fast-math-flags? 1836 B.buildFAdd(Dst, LdExp, CvtLo); 1837 MI.eraseFromParent(); 1838 return true; 1839 } 1840 1841 // TODO: Copied from DAG implementation. Verify logic and document how this 1842 // actually works. 1843 bool AMDGPULegalizerInfo::legalizeFPTOI( 1844 MachineInstr &MI, MachineRegisterInfo &MRI, 1845 MachineIRBuilder &B, bool Signed) const { 1846 1847 Register Dst = MI.getOperand(0).getReg(); 1848 Register Src = MI.getOperand(1).getReg(); 1849 1850 const LLT S64 = LLT::scalar(64); 1851 const LLT S32 = LLT::scalar(32); 1852 1853 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1854 1855 unsigned Flags = MI.getFlags(); 1856 1857 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1858 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1859 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1860 1861 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1862 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1863 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1864 1865 auto Hi = Signed ? 
1866 B.buildFPTOSI(S32, FloorMul) : 1867 B.buildFPTOUI(S32, FloorMul); 1868 auto Lo = B.buildFPTOUI(S32, Fma); 1869 1870 B.buildMerge(Dst, { Lo, Hi }); 1871 MI.eraseFromParent(); 1872 1873 return true; 1874 } 1875 1876 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1877 MachineInstr &MI) const { 1878 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1879 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1880 1881 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1882 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1883 1884 // With ieee_mode disabled, the instructions have the correct behavior 1885 // already for G_FMINNUM/G_FMAXNUM 1886 if (!MFI->getMode().IEEE) 1887 return !IsIEEEOp; 1888 1889 if (IsIEEEOp) 1890 return true; 1891 1892 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1893 } 1894 1895 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1896 MachineInstr &MI, MachineRegisterInfo &MRI, 1897 MachineIRBuilder &B) const { 1898 // TODO: Should move some of this into LegalizerHelper. 1899 1900 // TODO: Promote dynamic indexing of s16 to s32 1901 1902 // FIXME: Artifact combiner probably should have replaced the truncated 1903 // constant before this, so we shouldn't need 1904 // getConstantVRegValWithLookThrough. 1905 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1906 MI.getOperand(2).getReg(), MRI); 1907 if (!IdxVal) // Dynamic case will be selected to register indexing. 1908 return true; 1909 1910 Register Dst = MI.getOperand(0).getReg(); 1911 Register Vec = MI.getOperand(1).getReg(); 1912 1913 LLT VecTy = MRI.getType(Vec); 1914 LLT EltTy = VecTy.getElementType(); 1915 assert(EltTy == MRI.getType(Dst)); 1916 1917 if (IdxVal->Value < VecTy.getNumElements()) 1918 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1919 else 1920 B.buildUndef(Dst); 1921 1922 MI.eraseFromParent(); 1923 return true; 1924 } 1925 1926 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1927 MachineInstr &MI, MachineRegisterInfo &MRI, 1928 MachineIRBuilder &B) const { 1929 // TODO: Should move some of this into LegalizerHelper. 1930 1931 // TODO: Promote dynamic indexing of s16 to s32 1932 1933 // FIXME: Artifact combiner probably should have replaced the truncated 1934 // constant before this, so we shouldn't need 1935 // getConstantVRegValWithLookThrough. 1936 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1937 MI.getOperand(3).getReg(), MRI); 1938 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1939 return true; 1940 1941 Register Dst = MI.getOperand(0).getReg(); 1942 Register Vec = MI.getOperand(1).getReg(); 1943 Register Ins = MI.getOperand(2).getReg(); 1944 1945 LLT VecTy = MRI.getType(Vec); 1946 LLT EltTy = VecTy.getElementType(); 1947 assert(EltTy == MRI.getType(Ins)); 1948 1949 if (IdxVal->Value < VecTy.getNumElements()) 1950 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1951 else 1952 B.buildUndef(Dst); 1953 1954 MI.eraseFromParent(); 1955 return true; 1956 } 1957 1958 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1959 MachineInstr &MI, MachineRegisterInfo &MRI, 1960 MachineIRBuilder &B) const { 1961 const LLT V2S16 = LLT::vector(2, 16); 1962 1963 Register Dst = MI.getOperand(0).getReg(); 1964 Register Src0 = MI.getOperand(1).getReg(); 1965 LLT DstTy = MRI.getType(Dst); 1966 LLT SrcTy = MRI.getType(Src0); 1967 1968 if (SrcTy == V2S16 && DstTy == V2S16 && 1969 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1970 return true; 1971 1972 MachineIRBuilder HelperBuilder(MI); 1973 GISelObserverWrapper DummyObserver; 1974 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1975 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1976 } 1977 1978 bool AMDGPULegalizerInfo::legalizeSinCos( 1979 MachineInstr &MI, MachineRegisterInfo &MRI, 1980 MachineIRBuilder &B) const { 1981 1982 Register DstReg = MI.getOperand(0).getReg(); 1983 Register SrcReg = MI.getOperand(1).getReg(); 1984 LLT Ty = MRI.getType(DstReg); 1985 unsigned Flags = MI.getFlags(); 1986 1987 Register TrigVal; 1988 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 1989 if (ST.hasTrigReducedRange()) { 1990 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1991 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1992 .addUse(MulVal.getReg(0)) 1993 .setMIFlags(Flags).getReg(0); 1994 } else 1995 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1996 1997 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1998 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1999 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2000 .addUse(TrigVal) 2001 .setMIFlags(Flags); 2002 MI.eraseFromParent(); 2003 return true; 2004 } 2005 2006 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2007 MachineIRBuilder &B, 2008 const GlobalValue *GV, 2009 int64_t Offset, 2010 unsigned GAFlags) const { 2011 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2012 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2013 // to the following code sequence: 2014 // 2015 // For constant address space: 2016 // s_getpc_b64 s[0:1] 2017 // s_add_u32 s0, s0, $symbol 2018 // s_addc_u32 s1, s1, 0 2019 // 2020 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2021 // a fixup or relocation is emitted to replace $symbol with a literal 2022 // constant, which is a pc-relative offset from the encoding of the $symbol 2023 // operand to the global variable. 
2024 // 2025 // For global address space: 2026 // s_getpc_b64 s[0:1] 2027 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2028 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2029 // 2030 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2031 // fixups or relocations are emitted to replace $symbol@*@lo and 2032 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2033 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2034 // operand to the global variable. 2035 // 2036 // What we want here is an offset from the value returned by s_getpc 2037 // (which is the address of the s_add_u32 instruction) to the global 2038 // variable, but since the encoding of $symbol starts 4 bytes after the start 2039 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2040 // small. This requires us to add 4 to the global variable offset in order to 2041 // compute the correct address. 2042 2043 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2044 2045 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2046 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2047 2048 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2049 .addDef(PCReg); 2050 2051 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2052 if (GAFlags == SIInstrInfo::MO_NONE) 2053 MIB.addImm(0); 2054 else 2055 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2056 2057 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2058 2059 if (PtrTy.getSizeInBits() == 32) 2060 B.buildExtract(DstReg, PCReg, 0); 2061 return true; 2062 } 2063 2064 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2065 MachineInstr &MI, MachineRegisterInfo &MRI, 2066 MachineIRBuilder &B) const { 2067 Register DstReg = MI.getOperand(0).getReg(); 2068 LLT Ty = MRI.getType(DstReg); 2069 unsigned AS = Ty.getAddressSpace(); 2070 2071 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2072 MachineFunction &MF = B.getMF(); 2073 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2074 2075 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2076 if (!MFI->isEntryFunction()) { 2077 const Function &Fn = MF.getFunction(); 2078 DiagnosticInfoUnsupported BadLDSDecl( 2079 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2080 DS_Warning); 2081 Fn.getContext().diagnose(BadLDSDecl); 2082 2083 // We currently don't have a way to correctly allocate LDS objects that 2084 // aren't directly associated with a kernel. We do force inlining of 2085 // functions that use local objects. However, if these dead functions are 2086 // not eliminated, we don't want a compile time error. Just emit a warning 2087 // and a trap, since there should be no callable path here. 2088 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2089 B.buildUndef(DstReg); 2090 MI.eraseFromParent(); 2091 return true; 2092 } 2093 2094 // TODO: We could emit code to handle the initialization somewhere. 
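    // LDS has no mechanism for initialization, so only globals without a
    // defined initializer can be handled here: either leave the
    // G_GLOBAL_VALUE in place with an absolute 32-bit relocation, or fold the
    // object's statically assigned LDS offset into a constant. Anything with
    // a real initializer is diagnosed as unsupported below.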
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
        Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
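  // G_FMAD is only kept legal when denormals for the type are flushed, since
  // the mad/mac instructions that implement it flush denormals; otherwise
  // fall back to LegalizerHelper::lowerFMad, which splits it into
  // fmul + fadd.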
2174 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2175 return true; 2176 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2177 return true; 2178 2179 MachineIRBuilder HelperBuilder(MI); 2180 GISelObserverWrapper DummyObserver; 2181 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2182 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2183 } 2184 2185 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2186 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2187 Register DstReg = MI.getOperand(0).getReg(); 2188 Register PtrReg = MI.getOperand(1).getReg(); 2189 Register CmpVal = MI.getOperand(2).getReg(); 2190 Register NewVal = MI.getOperand(3).getReg(); 2191 2192 assert(SITargetLowering::isFlatGlobalAddrSpace( 2193 MRI.getType(PtrReg).getAddressSpace()) && 2194 "this should not have been custom lowered"); 2195 2196 LLT ValTy = MRI.getType(CmpVal); 2197 LLT VecTy = LLT::vector(2, ValTy); 2198 2199 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2200 2201 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2202 .addDef(DstReg) 2203 .addUse(PtrReg) 2204 .addUse(PackedVal) 2205 .setMemRefs(MI.memoperands()); 2206 2207 MI.eraseFromParent(); 2208 return true; 2209 } 2210 2211 bool AMDGPULegalizerInfo::legalizeFlog( 2212 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2213 Register Dst = MI.getOperand(0).getReg(); 2214 Register Src = MI.getOperand(1).getReg(); 2215 LLT Ty = B.getMRI()->getType(Dst); 2216 unsigned Flags = MI.getFlags(); 2217 2218 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2219 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2220 2221 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2222 MI.eraseFromParent(); 2223 return true; 2224 } 2225 2226 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2227 MachineIRBuilder &B) const { 2228 Register Dst = MI.getOperand(0).getReg(); 2229 Register Src = MI.getOperand(1).getReg(); 2230 unsigned Flags = MI.getFlags(); 2231 LLT Ty = B.getMRI()->getType(Dst); 2232 2233 auto K = B.buildFConstant(Ty, numbers::log2e); 2234 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2235 B.buildFExp2(Dst, Mul, Flags); 2236 MI.eraseFromParent(); 2237 return true; 2238 } 2239 2240 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2241 MachineIRBuilder &B) const { 2242 Register Dst = MI.getOperand(0).getReg(); 2243 Register Src0 = MI.getOperand(1).getReg(); 2244 Register Src1 = MI.getOperand(2).getReg(); 2245 unsigned Flags = MI.getFlags(); 2246 LLT Ty = B.getMRI()->getType(Dst); 2247 const LLT S16 = LLT::scalar(16); 2248 const LLT S32 = LLT::scalar(32); 2249 2250 if (Ty == S32) { 2251 auto Log = B.buildFLog2(S32, Src0, Flags); 2252 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2253 .addUse(Log.getReg(0)) 2254 .addUse(Src1) 2255 .setMIFlags(Flags); 2256 B.buildFExp2(Dst, Mul, Flags); 2257 } else if (Ty == S16) { 2258 // There's no f16 fmul_legacy, so we need to convert for it. 
2259 auto Log = B.buildFLog2(S16, Src0, Flags); 2260 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2261 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2262 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2263 .addUse(Ext0.getReg(0)) 2264 .addUse(Ext1.getReg(0)) 2265 .setMIFlags(Flags); 2266 2267 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2268 } else 2269 return false; 2270 2271 MI.eraseFromParent(); 2272 return true; 2273 } 2274 2275 // Find a source register, ignoring any possible source modifiers. 2276 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2277 Register ModSrc = OrigSrc; 2278 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2279 ModSrc = SrcFNeg->getOperand(1).getReg(); 2280 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2281 ModSrc = SrcFAbs->getOperand(1).getReg(); 2282 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2283 ModSrc = SrcFAbs->getOperand(1).getReg(); 2284 return ModSrc; 2285 } 2286 2287 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2288 MachineRegisterInfo &MRI, 2289 MachineIRBuilder &B) const { 2290 2291 const LLT S1 = LLT::scalar(1); 2292 const LLT S64 = LLT::scalar(64); 2293 Register Dst = MI.getOperand(0).getReg(); 2294 Register OrigSrc = MI.getOperand(1).getReg(); 2295 unsigned Flags = MI.getFlags(); 2296 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2297 "this should not have been custom lowered"); 2298 2299 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2300 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2301 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2302 // V_FRACT bug is: 2303 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2304 // 2305 // Convert floor(x) to (x - fract(x)) 2306 2307 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2308 .addUse(OrigSrc) 2309 .setMIFlags(Flags); 2310 2311 // Give source modifier matching some assistance before obscuring a foldable 2312 // pattern. 2313 2314 // TODO: We can avoid the neg on the fract? The input sign to fract 2315 // shouldn't matter? 2316 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2317 2318 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2319 2320 Register Min = MRI.createGenericVirtualRegister(S64); 2321 2322 // We don't need to concern ourselves with the snan handling difference, so 2323 // use the one which will directly select. 2324 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2325 if (MFI->getMode().IEEE) 2326 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2327 else 2328 B.buildFMinNum(Min, Fract, Const, Flags); 2329 2330 Register CorrectedFract = Min; 2331 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2332 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2333 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2334 } 2335 2336 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2337 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2338 2339 MI.eraseFromParent(); 2340 return true; 2341 } 2342 2343 // Turn an illegal packed v2s16 build vector into bit operations. 2344 // TODO: This should probably be a bitcast action in LegalizerHelper. 
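// For example, the following (illustrative MIR; the virtual register names
// are invented):
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// is rewritten to:
//   %m:_(s32) = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)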
2345 bool AMDGPULegalizerInfo::legalizeBuildVector( 2346 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2347 Register Dst = MI.getOperand(0).getReg(); 2348 const LLT S32 = LLT::scalar(32); 2349 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2350 2351 Register Src0 = MI.getOperand(1).getReg(); 2352 Register Src1 = MI.getOperand(2).getReg(); 2353 assert(MRI.getType(Src0) == LLT::scalar(16)); 2354 2355 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2356 B.buildBitcast(Dst, Merge); 2357 2358 MI.eraseFromParent(); 2359 return true; 2360 } 2361 2362 // Return the use branch instruction, otherwise null if the usage is invalid. 2363 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2364 MachineRegisterInfo &MRI, 2365 MachineInstr *&Br, 2366 MachineBasicBlock *&UncondBrTarget) { 2367 Register CondDef = MI.getOperand(0).getReg(); 2368 if (!MRI.hasOneNonDBGUse(CondDef)) 2369 return nullptr; 2370 2371 MachineBasicBlock *Parent = MI.getParent(); 2372 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2373 if (UseMI.getParent() != Parent || 2374 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2375 return nullptr; 2376 2377 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2378 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2379 if (Next == Parent->end()) { 2380 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2381 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2382 return nullptr; 2383 UncondBrTarget = &*NextMBB; 2384 } else { 2385 if (Next->getOpcode() != AMDGPU::G_BR) 2386 return nullptr; 2387 Br = &*Next; 2388 UncondBrTarget = Br->getOperand(0).getMBB(); 2389 } 2390 2391 return &UseMI; 2392 } 2393 2394 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2395 MachineRegisterInfo &MRI, 2396 Register LiveIn, 2397 Register PhyReg) const { 2398 assert(PhyReg.isPhysical() && "Physical register expected"); 2399 2400 // Insert the live-in copy, if required, by defining destination virtual 2401 // register. 2402 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(PhyReg);
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, PhyReg);

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return LiveIn;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register PhyReg, LLT Ty,
                                                bool InsertLiveInCopy) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Get or create the virtual live-in register.
  Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
  if (!LiveIn) {
    LiveIn = MRI.createGenericVirtualRegister(Ty);
    MRI.addLiveIn(PhyReg, LiveIn);
  }

  // When the copy that is actually required goes from a virtual register to a
  // physical register (and will be inserted later), there is no need to
  // insert a live-in copy from the physical register to the virtual register
  // here.
  if (!InsertLiveInCopy)
    return LiveIn;

  return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
}

const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
    MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return nullptr;
  }
  return Arg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  Register SrcReg = Arg->getRegister();
  assert(SrcReg.isPhysical() && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
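    // A masked argument occupies a bit-field of the physical register (for
    // example the workitem ID components packed into one VGPR). It is
    // recovered as:
    //   (LiveIn >> countTrailingZeros(Mask)) & (Mask >> Shift)
    // which is exactly the shift/and sequence built below.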
2470 const LLT S32 = LLT::scalar(32); 2471 const unsigned Mask = Arg->getMask(); 2472 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2473 2474 Register AndMaskSrc = LiveIn; 2475 2476 if (Shift != 0) { 2477 auto ShiftAmt = B.buildConstant(S32, Shift); 2478 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2479 } 2480 2481 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2482 } else { 2483 B.buildCopy(DstReg, LiveIn); 2484 } 2485 2486 return true; 2487 } 2488 2489 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2490 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2491 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2492 2493 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2494 if (!Arg) 2495 return false; 2496 2497 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2498 return false; 2499 2500 MI.eraseFromParent(); 2501 return true; 2502 } 2503 2504 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2505 MachineRegisterInfo &MRI, 2506 MachineIRBuilder &B) const { 2507 Register Dst = MI.getOperand(0).getReg(); 2508 LLT DstTy = MRI.getType(Dst); 2509 LLT S16 = LLT::scalar(16); 2510 LLT S32 = LLT::scalar(32); 2511 LLT S64 = LLT::scalar(64); 2512 2513 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2514 return true; 2515 2516 if (DstTy == S16) 2517 return legalizeFDIV16(MI, MRI, B); 2518 if (DstTy == S32) 2519 return legalizeFDIV32(MI, MRI, B); 2520 if (DstTy == S64) 2521 return legalizeFDIV64(MI, MRI, B); 2522 2523 return false; 2524 } 2525 2526 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2527 const LLT S32 = LLT::scalar(32); 2528 2529 auto Cvt0 = B.buildUITOFP(S32, Src); 2530 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2531 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2532 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2533 return B.buildFPTOUI(S32, Mul).getReg(0); 2534 } 2535 2536 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2537 Register DstReg, 2538 Register Num, 2539 Register Den, 2540 bool IsDiv) const { 2541 const LLT S1 = LLT::scalar(1); 2542 const LLT S32 = LLT::scalar(32); 2543 2544 // RCP = URECIP(Den) = 2^32 / Den + e 2545 // e is rounding error. 2546 auto RCP = buildDivRCP(B, Den); 2547 2548 // RCP_LO = mul(RCP, Den) 2549 auto RCP_LO = B.buildMul(S32, RCP, Den); 2550 2551 // RCP_HI = mulhu (RCP, Den) */ 2552 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2553 2554 // NEG_RCP_LO = -RCP_LO 2555 auto Zero = B.buildConstant(S32, 0); 2556 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2557 2558 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2559 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2560 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2561 2562 // Calculate the rounding error from the URECIP instruction 2563 // E = mulhu(ABS_RCP_LO, RCP) 2564 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2565 2566 // RCP_A_E = RCP + E 2567 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2568 2569 // RCP_S_E = RCP - E 2570 auto RCP_S_E = B.buildSub(S32, RCP, E); 2571 2572 // Tmp0 = (RCP_HI == 0 ? 
// RCP_A_E : RCP_S_E)
  auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  auto Quotient = B.buildUMulH(S32, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);

  // Remainder_GE_Den = Remainder >= Den
  auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);

  // Remainder_GE_Zero = Num >= Num_S_Remainder
  auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
                                       Num, Num_S_Remainder);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);

  // Calculate Division result:

  // Quotient_A_One = Quotient + 1
  auto One = B.buildConstant(S32, 1);
  auto Quotient_A_One = B.buildAdd(S32, Quotient, One);

  // Quotient_S_One = Quotient - 1
  auto Quotient_S_One = B.buildSub(S32, Quotient, One);

  // Div = (Tmp1 ? Quotient_A_One : Quotient)
  auto Div = B.buildSelect(S32, Tmp1, Quotient_A_One, Quotient);

  // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
  if (IsDiv) {
    B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
  } else {
    Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);

    // Calculate Rem result:
    auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);

    // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
    auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);

    // Rem = (Remainder_GE_Zero ?
Rem : Remainder_A_Den) 2622 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den); 2623 } 2624 } 2625 2626 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2627 MachineRegisterInfo &MRI, 2628 MachineIRBuilder &B) const { 2629 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2630 Register DstReg = MI.getOperand(0).getReg(); 2631 Register Num = MI.getOperand(1).getReg(); 2632 Register Den = MI.getOperand(2).getReg(); 2633 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2634 MI.eraseFromParent(); 2635 return true; 2636 } 2637 2638 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2639 // 2640 // Return lo, hi of result 2641 // 2642 // %cvt.lo = G_UITOFP Val.lo 2643 // %cvt.hi = G_UITOFP Val.hi 2644 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2645 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2646 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2647 // %mul2 = G_FMUL %mul1, 2**(-32) 2648 // %trunc = G_INTRINSIC_TRUNC %mul2 2649 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2650 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2651 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2652 Register Val) { 2653 const LLT S32 = LLT::scalar(32); 2654 auto Unmerge = B.buildUnmerge(S32, Val); 2655 2656 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2657 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2658 2659 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2660 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2661 2662 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2663 auto Mul1 = 2664 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2665 2666 // 2**(-32) 2667 auto Mul2 = 2668 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2669 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2670 2671 // -(2**32) 2672 auto Mad2 = B.buildFMAD(S32, Trunc, 2673 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2674 2675 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2676 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2677 2678 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2679 } 2680 2681 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2682 Register DstReg, 2683 Register Numer, 2684 Register Denom, 2685 bool IsDiv) const { 2686 const LLT S32 = LLT::scalar(32); 2687 const LLT S64 = LLT::scalar(64); 2688 const LLT S1 = LLT::scalar(1); 2689 Register RcpLo, RcpHi; 2690 2691 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2692 2693 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2694 2695 auto Zero64 = B.buildConstant(S64, 0); 2696 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2697 2698 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2699 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2700 2701 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2702 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2703 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2704 2705 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2706 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2707 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2708 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2709 2710 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2711 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2712 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2713 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2714 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2715 2716 auto Zero32 = B.buildConstant(S32, 0); 2717 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2718 auto Add2_HiC = 
2719 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2720 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2721 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2722 2723 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2724 Register NumerLo = UnmergeNumer.getReg(0); 2725 Register NumerHi = UnmergeNumer.getReg(1); 2726 2727 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2728 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2729 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2730 Register Mul3_Lo = UnmergeMul3.getReg(0); 2731 Register Mul3_Hi = UnmergeMul3.getReg(1); 2732 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2733 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2734 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2735 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2736 2737 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2738 Register DenomLo = UnmergeDenom.getReg(0); 2739 Register DenomHi = UnmergeDenom.getReg(1); 2740 2741 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2742 auto C1 = B.buildSExt(S32, CmpHi); 2743 2744 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2745 auto C2 = B.buildSExt(S32, CmpLo); 2746 2747 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2748 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2749 2750 // TODO: Here and below portions of the code can be enclosed into if/endif. 2751 // Currently control flow is unconditional and we have 4 selects after 2752 // potential endif to substitute PHIs. 2753 2754 // if C3 != 0 ... 2755 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2756 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2757 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2758 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2759 2760 auto One64 = B.buildConstant(S64, 1); 2761 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2762 2763 auto C4 = 2764 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2765 auto C5 = 2766 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2767 auto C6 = B.buildSelect( 2768 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2769 2770 // if (C6 != 0) 2771 auto Add4 = B.buildAdd(S64, Add3, One64); 2772 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2773 2774 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2775 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2776 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2777 2778 // endif C6 2779 // endif C3 2780 2781 if (IsDiv) { 2782 auto Sel1 = B.buildSelect( 2783 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2784 B.buildSelect(DstReg, 2785 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2786 } else { 2787 auto Sel2 = B.buildSelect( 2788 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2789 B.buildSelect(DstReg, 2790 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2791 } 2792 } 2793 2794 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2795 MachineRegisterInfo &MRI, 2796 MachineIRBuilder &B) const { 2797 const LLT S64 = LLT::scalar(64); 2798 const LLT S32 = LLT::scalar(32); 2799 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2800 Register DstReg = MI.getOperand(0).getReg(); 2801 Register Num = MI.getOperand(1).getReg(); 2802 Register Den = MI.getOperand(2).getReg(); 2803 LLT Ty = 
MRI.getType(DstReg); 2804 2805 if (Ty == S32) 2806 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2807 else if (Ty == S64) 2808 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2809 else 2810 return false; 2811 2812 MI.eraseFromParent(); 2813 return true; 2814 2815 } 2816 2817 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2818 MachineRegisterInfo &MRI, 2819 MachineIRBuilder &B) const { 2820 const LLT S64 = LLT::scalar(64); 2821 const LLT S32 = LLT::scalar(32); 2822 2823 Register DstReg = MI.getOperand(0).getReg(); 2824 const LLT Ty = MRI.getType(DstReg); 2825 if (Ty != S32 && Ty != S64) 2826 return false; 2827 2828 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2829 2830 Register LHS = MI.getOperand(1).getReg(); 2831 Register RHS = MI.getOperand(2).getReg(); 2832 2833 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2834 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2835 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2836 2837 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2838 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2839 2840 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2841 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2842 2843 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2844 if (Ty == S32) 2845 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2846 else 2847 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2848 2849 Register Sign; 2850 if (IsDiv) 2851 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2852 else 2853 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2854 2855 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2856 B.buildSub(DstReg, UDivRem, Sign); 2857 2858 MI.eraseFromParent(); 2859 return true; 2860 } 2861 2862 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2863 MachineRegisterInfo &MRI, 2864 MachineIRBuilder &B) const { 2865 Register Res = MI.getOperand(0).getReg(); 2866 Register LHS = MI.getOperand(1).getReg(); 2867 Register RHS = MI.getOperand(2).getReg(); 2868 2869 uint16_t Flags = MI.getFlags(); 2870 2871 LLT ResTy = MRI.getType(Res); 2872 LLT S32 = LLT::scalar(32); 2873 LLT S64 = LLT::scalar(64); 2874 2875 const MachineFunction &MF = B.getMF(); 2876 bool Unsafe = 2877 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2878 2879 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2880 return false; 2881 2882 if (!Unsafe && ResTy == S32 && 2883 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2884 return false; 2885 2886 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2887 // 1 / x -> RCP(x) 2888 if (CLHS->isExactlyValue(1.0)) { 2889 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2890 .addUse(RHS) 2891 .setMIFlags(Flags); 2892 2893 MI.eraseFromParent(); 2894 return true; 2895 } 2896 2897 // -1 / x -> RCP( FNEG(x) ) 2898 if (CLHS->isExactlyValue(-1.0)) { 2899 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2900 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2901 .addUse(FNeg.getReg(0)) 2902 .setMIFlags(Flags); 2903 2904 MI.eraseFromParent(); 2905 return true; 2906 } 2907 } 2908 2909 // x / y -> x * (1.0 / y) 2910 if (Unsafe) { 2911 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2912 .addUse(RHS) 2913 .setMIFlags(Flags); 2914 B.buildFMul(Res, LHS, RCP, Flags); 2915 2916 MI.eraseFromParent(); 2917 return true; 2918 } 2919 2920 return false; 2921 } 2922 2923 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2924 MachineRegisterInfo &MRI, 2925 MachineIRBuilder &B) const { 
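  // f16 fdiv is expanded by doing the arithmetic in f32, roughly:
  //   q   = fptrunc(fpext(lhs) * rcp(fpext(rhs)))
  //   res = llvm.amdgcn.div.fixup(q, rhs, lhs)
  // The f32 reciprocal is accurate enough for a half-precision result, and
  // div.fixup handles the special-case inputs.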
2926 Register Res = MI.getOperand(0).getReg(); 2927 Register LHS = MI.getOperand(1).getReg(); 2928 Register RHS = MI.getOperand(2).getReg(); 2929 2930 uint16_t Flags = MI.getFlags(); 2931 2932 LLT S16 = LLT::scalar(16); 2933 LLT S32 = LLT::scalar(32); 2934 2935 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2936 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2937 2938 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2939 .addUse(RHSExt.getReg(0)) 2940 .setMIFlags(Flags); 2941 2942 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2943 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2944 2945 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2946 .addUse(RDst.getReg(0)) 2947 .addUse(RHS) 2948 .addUse(LHS) 2949 .setMIFlags(Flags); 2950 2951 MI.eraseFromParent(); 2952 return true; 2953 } 2954 2955 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2956 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2957 static void toggleSPDenormMode(bool Enable, 2958 MachineIRBuilder &B, 2959 const GCNSubtarget &ST, 2960 AMDGPU::SIModeRegisterDefaults Mode) { 2961 // Set SP denorm mode to this value. 2962 unsigned SPDenormMode = 2963 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2964 2965 if (ST.hasDenormModeInst()) { 2966 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2967 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2968 2969 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2970 B.buildInstr(AMDGPU::S_DENORM_MODE) 2971 .addImm(NewDenormModeValue); 2972 2973 } else { 2974 // Select FP32 bit field in mode register. 2975 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2976 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2977 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2978 2979 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2980 .addImm(SPDenormMode) 2981 .addImm(SPDenormModeBitField); 2982 } 2983 } 2984 2985 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2986 MachineRegisterInfo &MRI, 2987 MachineIRBuilder &B) const { 2988 Register Res = MI.getOperand(0).getReg(); 2989 Register LHS = MI.getOperand(1).getReg(); 2990 Register RHS = MI.getOperand(2).getReg(); 2991 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2992 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2993 2994 uint16_t Flags = MI.getFlags(); 2995 2996 LLT S32 = LLT::scalar(32); 2997 LLT S1 = LLT::scalar(1); 2998 2999 auto One = B.buildFConstant(S32, 1.0f); 3000 3001 auto DenominatorScaled = 3002 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3003 .addUse(LHS) 3004 .addUse(RHS) 3005 .addImm(0) 3006 .setMIFlags(Flags); 3007 auto NumeratorScaled = 3008 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3009 .addUse(LHS) 3010 .addUse(RHS) 3011 .addImm(1) 3012 .setMIFlags(Flags); 3013 3014 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3015 .addUse(DenominatorScaled.getReg(0)) 3016 .setMIFlags(Flags); 3017 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3018 3019 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3020 // aren't modeled as reading it. 
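  // The reciprocal refinement below relies on f32 denormals not being
  // flushed, so if the function's default mode flushes them, FP32 denormal
  // support is switched on around the FMA chain and restored afterwards.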
3021 if (!Mode.allFP32Denormals()) 3022 toggleSPDenormMode(true, B, ST, Mode); 3023 3024 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3025 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3026 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3027 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3028 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3029 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3030 3031 if (!Mode.allFP32Denormals()) 3032 toggleSPDenormMode(false, B, ST, Mode); 3033 3034 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3035 .addUse(Fma4.getReg(0)) 3036 .addUse(Fma1.getReg(0)) 3037 .addUse(Fma3.getReg(0)) 3038 .addUse(NumeratorScaled.getReg(1)) 3039 .setMIFlags(Flags); 3040 3041 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3042 .addUse(Fmas.getReg(0)) 3043 .addUse(RHS) 3044 .addUse(LHS) 3045 .setMIFlags(Flags); 3046 3047 MI.eraseFromParent(); 3048 return true; 3049 } 3050 3051 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3052 MachineRegisterInfo &MRI, 3053 MachineIRBuilder &B) const { 3054 Register Res = MI.getOperand(0).getReg(); 3055 Register LHS = MI.getOperand(1).getReg(); 3056 Register RHS = MI.getOperand(2).getReg(); 3057 3058 uint16_t Flags = MI.getFlags(); 3059 3060 LLT S64 = LLT::scalar(64); 3061 LLT S1 = LLT::scalar(1); 3062 3063 auto One = B.buildFConstant(S64, 1.0); 3064 3065 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3066 .addUse(LHS) 3067 .addUse(RHS) 3068 .addImm(0) 3069 .setMIFlags(Flags); 3070 3071 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3072 3073 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3074 .addUse(DivScale0.getReg(0)) 3075 .setMIFlags(Flags); 3076 3077 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3078 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3079 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3080 3081 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3082 .addUse(LHS) 3083 .addUse(RHS) 3084 .addImm(1) 3085 .setMIFlags(Flags); 3086 3087 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3088 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3089 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3090 3091 Register Scale; 3092 if (!ST.hasUsableDivScaleConditionOutput()) { 3093 // Workaround a hardware bug on SI where the condition output from div_scale 3094 // is not usable. 
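    // Derive an equivalent flag from the values instead: compare the high
    // halves of the original operands against the high halves of the two
    // div_scale results and xor the compares; the result stands in for the
    // condition bit that div_fmas consumes below.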
3095 3096 LLT S32 = LLT::scalar(32); 3097 3098 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3099 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3100 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3101 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3102 3103 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3104 Scale1Unmerge.getReg(1)); 3105 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3106 Scale0Unmerge.getReg(1)); 3107 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3108 } else { 3109 Scale = DivScale1.getReg(1); 3110 } 3111 3112 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3113 .addUse(Fma4.getReg(0)) 3114 .addUse(Fma3.getReg(0)) 3115 .addUse(Mul.getReg(0)) 3116 .addUse(Scale) 3117 .setMIFlags(Flags); 3118 3119 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3120 .addUse(Fmas.getReg(0)) 3121 .addUse(RHS) 3122 .addUse(LHS) 3123 .setMIFlags(Flags); 3124 3125 MI.eraseFromParent(); 3126 return true; 3127 } 3128 3129 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3130 MachineRegisterInfo &MRI, 3131 MachineIRBuilder &B) const { 3132 Register Res = MI.getOperand(0).getReg(); 3133 Register LHS = MI.getOperand(2).getReg(); 3134 Register RHS = MI.getOperand(3).getReg(); 3135 uint16_t Flags = MI.getFlags(); 3136 3137 LLT S32 = LLT::scalar(32); 3138 LLT S1 = LLT::scalar(1); 3139 3140 auto Abs = B.buildFAbs(S32, RHS, Flags); 3141 const APFloat C0Val(1.0f); 3142 3143 auto C0 = B.buildConstant(S32, 0x6f800000); 3144 auto C1 = B.buildConstant(S32, 0x2f800000); 3145 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3146 3147 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3148 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3149 3150 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3151 3152 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3153 .addUse(Mul0.getReg(0)) 3154 .setMIFlags(Flags); 3155 3156 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3157 3158 B.buildFMul(Res, Sel, Mul1, Flags); 3159 3160 MI.eraseFromParent(); 3161 return true; 3162 } 3163 3164 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3165 MachineRegisterInfo &MRI, 3166 MachineIRBuilder &B) const { 3167 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3168 if (!MFI->isEntryFunction()) { 3169 return legalizePreloadedArgIntrin(MI, MRI, B, 3170 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3171 } 3172 3173 uint64_t Offset = 3174 ST.getTargetLowering()->getImplicitParameterOffset( 3175 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3176 Register DstReg = MI.getOperand(0).getReg(); 3177 LLT DstTy = MRI.getType(DstReg); 3178 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3179 3180 const ArgDescriptor *Arg; 3181 const TargetRegisterClass *RC; 3182 std::tie(Arg, RC) 3183 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3184 if (!Arg) 3185 return false; 3186 3187 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3188 if (!loadInputValue(KernargPtrReg, B, Arg)) 3189 return false; 3190 3191 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3192 MI.eraseFromParent(); 3193 return true; 3194 } 3195 3196 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3197 MachineRegisterInfo &MRI, 3198 MachineIRBuilder &B, 3199 unsigned AddrSpace) const { 3200 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3201 auto Hi32 = B.buildExtract(LLT::scalar(32), 
MI.getOperand(2).getReg(), 32); 3202 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3203 MI.eraseFromParent(); 3204 return true; 3205 } 3206 3207 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3208 // offset (the offset that is included in bounds checking and swizzling, to be 3209 // split between the instruction's voffset and immoffset fields) and soffset 3210 // (the offset that is excluded from bounds checking and swizzling, to go in 3211 // the instruction's soffset field). This function takes the first kind of 3212 // offset and figures out how to split it between voffset and immoffset. 3213 std::tuple<Register, unsigned, unsigned> 3214 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3215 Register OrigOffset) const { 3216 const unsigned MaxImm = 4095; 3217 Register BaseReg; 3218 unsigned TotalConstOffset; 3219 MachineInstr *OffsetDef; 3220 const LLT S32 = LLT::scalar(32); 3221 3222 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3223 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3224 3225 unsigned ImmOffset = TotalConstOffset; 3226 3227 // If the immediate value is too big for the immoffset field, put the value 3228 // and -4096 into the immoffset field so that the value that is copied/added 3229 // for the voffset field is a multiple of 4096, and it stands more chance 3230 // of being CSEd with the copy/add for another similar load/store. 3231 // However, do not do that rounding down to a multiple of 4096 if that is a 3232 // negative number, as it appears to be illegal to have a negative offset 3233 // in the vgpr, even if adding the immediate offset makes it positive. 3234 unsigned Overflow = ImmOffset & ~MaxImm; 3235 ImmOffset -= Overflow; 3236 if ((int32_t)Overflow < 0) { 3237 Overflow += ImmOffset; 3238 ImmOffset = 0; 3239 } 3240 3241 if (Overflow != 0) { 3242 if (!BaseReg) { 3243 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3244 } else { 3245 auto OverflowVal = B.buildConstant(S32, Overflow); 3246 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3247 } 3248 } 3249 3250 if (!BaseReg) 3251 BaseReg = B.buildConstant(S32, 0).getReg(0); 3252 3253 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3254 } 3255 3256 /// Handle register layout difference for f16 images for some subtargets. 3257 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3258 MachineRegisterInfo &MRI, 3259 Register Reg) const { 3260 if (!ST.hasUnpackedD16VMem()) 3261 return Reg; 3262 3263 const LLT S16 = LLT::scalar(16); 3264 const LLT S32 = LLT::scalar(32); 3265 LLT StoreVT = MRI.getType(Reg); 3266 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3267 3268 auto Unmerge = B.buildUnmerge(S16, Reg); 3269 3270 SmallVector<Register, 4> WideRegs; 3271 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3272 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3273 3274 int NumElts = StoreVT.getNumElements(); 3275 3276 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3277 } 3278 3279 Register AMDGPULegalizerInfo::fixStoreSourceType( 3280 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3281 MachineRegisterInfo *MRI = B.getMRI(); 3282 LLT Ty = MRI->getType(VData); 3283 3284 const LLT S16 = LLT::scalar(16); 3285 3286 // Fixup illegal register types for i8 stores. 
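  // Buffer stores only take 32-bit (or packed 16-bit) VGPR sources, so s8 and
  // s16 store data is any-extended to s32 here. No bits are lost: the
  // byte/short store opcode is still chosen later from the memory size
  // recorded in the MMO.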
3287 if (Ty == LLT::scalar(8) || Ty == S16) { 3288 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3289 return AnyExt; 3290 } 3291 3292 if (Ty.isVector()) { 3293 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3294 if (IsFormat) 3295 return handleD16VData(B, *MRI, VData); 3296 } 3297 } 3298 3299 return VData; 3300 } 3301 3302 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3303 MachineRegisterInfo &MRI, 3304 MachineIRBuilder &B, 3305 bool IsTyped, 3306 bool IsFormat) const { 3307 Register VData = MI.getOperand(1).getReg(); 3308 LLT Ty = MRI.getType(VData); 3309 LLT EltTy = Ty.getScalarType(); 3310 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3311 const LLT S32 = LLT::scalar(32); 3312 3313 VData = fixStoreSourceType(B, VData, IsFormat); 3314 Register RSrc = MI.getOperand(2).getReg(); 3315 3316 MachineMemOperand *MMO = *MI.memoperands_begin(); 3317 const int MemSize = MMO->getSize(); 3318 3319 unsigned ImmOffset; 3320 unsigned TotalOffset; 3321 3322 // The typed intrinsics add an immediate after the registers. 3323 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3324 3325 // The struct intrinsic variants add one additional operand over raw. 3326 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3327 Register VIndex; 3328 int OpOffset = 0; 3329 if (HasVIndex) { 3330 VIndex = MI.getOperand(3).getReg(); 3331 OpOffset = 1; 3332 } 3333 3334 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3335 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3336 3337 unsigned Format = 0; 3338 if (IsTyped) { 3339 Format = MI.getOperand(5 + OpOffset).getImm(); 3340 ++OpOffset; 3341 } 3342 3343 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3344 3345 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3346 if (TotalOffset != 0) 3347 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3348 3349 unsigned Opc; 3350 if (IsTyped) { 3351 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3352 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3353 } else if (IsFormat) { 3354 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3355 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3356 } else { 3357 switch (MemSize) { 3358 case 1: 3359 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3360 break; 3361 case 2: 3362 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3363 break; 3364 default: 3365 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3366 break; 3367 } 3368 } 3369 3370 if (!VIndex) 3371 VIndex = B.buildConstant(S32, 0).getReg(0); 3372 3373 auto MIB = B.buildInstr(Opc) 3374 .addUse(VData) // vdata 3375 .addUse(RSrc) // rsrc 3376 .addUse(VIndex) // vindex 3377 .addUse(VOffset) // voffset 3378 .addUse(SOffset) // soffset 3379 .addImm(ImmOffset); // offset(imm) 3380 3381 if (IsTyped) 3382 MIB.addImm(Format); 3383 3384 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3385 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3386 .addMemOperand(MMO); 3387 3388 MI.eraseFromParent(); 3389 return true; 3390 } 3391 3392 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3393 MachineRegisterInfo &MRI, 3394 MachineIRBuilder &B, 3395 bool IsFormat, 3396 bool IsTyped) const { 3397 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
3398   MachineMemOperand *MMO = *MI.memoperands_begin();
3399   const int MemSize = MMO->getSize();
3400   const LLT S32 = LLT::scalar(32);
3401
3402   Register Dst = MI.getOperand(0).getReg();
3403   Register RSrc = MI.getOperand(2).getReg();
3404
3405   // The typed intrinsics add an immediate after the registers.
3406   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3407
3408   // The struct intrinsic variants add one additional operand over raw.
3409   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3410   Register VIndex;
3411   int OpOffset = 0;
3412   if (HasVIndex) {
3413     VIndex = MI.getOperand(3).getReg();
3414     OpOffset = 1;
3415   }
3416
3417   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3418   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3419
3420   unsigned Format = 0;
3421   if (IsTyped) {
3422     Format = MI.getOperand(5 + OpOffset).getImm();
3423     ++OpOffset;
3424   }
3425
3426   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3427   unsigned ImmOffset;
3428   unsigned TotalOffset;
3429
3430   LLT Ty = MRI.getType(Dst);
3431   LLT EltTy = Ty.getScalarType();
3432   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3433   const bool Unpacked = ST.hasUnpackedD16VMem();
3434
3435   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3436   if (TotalOffset != 0)
3437     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3438
3439   unsigned Opc;
3440
3441   if (IsTyped) {
3442     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3443                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3444   } else if (IsFormat) {
3445     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3446                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3447   } else {
3448     switch (MemSize) {
3449     case 1:
3450       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3451       break;
3452     case 2:
3453       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3454       break;
3455     default:
3456       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3457       break;
3458     }
3459   }
3460
3461   Register LoadDstReg;
3462
3463   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3464   LLT UnpackedTy = Ty.changeElementSize(32);
3465
3466   if (IsExtLoad)
3467     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3468   else if (Unpacked && IsD16 && Ty.isVector())
3469     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3470   else
3471     LoadDstReg = Dst;
3472
3473   if (!VIndex)
3474     VIndex = B.buildConstant(S32, 0).getReg(0);
3475
3476   auto MIB = B.buildInstr(Opc)
3477     .addDef(LoadDstReg) // vdata
3478     .addUse(RSrc)       // rsrc
3479     .addUse(VIndex)     // vindex
3480     .addUse(VOffset)    // voffset
3481     .addUse(SOffset)    // soffset
3482     .addImm(ImmOffset); // offset(imm)
3483
3484   if (IsTyped)
3485     MIB.addImm(Format);
3486
3487   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3488      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3489      .addMemOperand(MMO);
3490
3491   if (LoadDstReg != Dst) {
3492     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3493
3494     // The load was widened; narrow the result back down to the original type.
3495 if (IsExtLoad) 3496 B.buildTrunc(Dst, LoadDstReg); 3497 else { 3498 // Repack to original 16-bit vector result 3499 // FIXME: G_TRUNC should work, but legalization currently fails 3500 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3501 SmallVector<Register, 4> Repack; 3502 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3503 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3504 B.buildMerge(Dst, Repack); 3505 } 3506 } 3507 3508 MI.eraseFromParent(); 3509 return true; 3510 } 3511 3512 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3513 MachineIRBuilder &B, 3514 bool IsInc) const { 3515 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3516 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3517 B.buildInstr(Opc) 3518 .addDef(MI.getOperand(0).getReg()) 3519 .addUse(MI.getOperand(2).getReg()) 3520 .addUse(MI.getOperand(3).getReg()) 3521 .cloneMemRefs(MI); 3522 MI.eraseFromParent(); 3523 return true; 3524 } 3525 3526 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3527 switch (IntrID) { 3528 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3529 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3530 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3531 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3532 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3533 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3534 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3535 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3536 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3537 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3538 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3539 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3540 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3541 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3542 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3543 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3544 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3545 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3546 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3547 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3548 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3549 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3550 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3551 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3552 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3553 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3554 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3555 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3556 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3557 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3558 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3559 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3560 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3561 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3562 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3563 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3564 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3565 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3566 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3567 default: 3568 llvm_unreachable("unhandled atomic opcode"); 3569 } 3570 } 3571 3572 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3573 MachineIRBuilder &B, 3574 Intrinsic::ID IID) const { 3575 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3576 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3577 3578 Register Dst = MI.getOperand(0).getReg(); 3579 Register VData = MI.getOperand(2).getReg(); 3580 3581 Register CmpVal; 
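  // The cmpswap intrinsics carry the compare value as an extra operand
  // immediately after vdata, which shifts all later operands by one.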
3582 int OpOffset = 0; 3583 3584 if (IsCmpSwap) { 3585 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3586 ++OpOffset; 3587 } 3588 3589 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3590 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3591 3592 // The struct intrinsic variants add one additional operand over raw. 3593 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3594 Register VIndex; 3595 if (HasVIndex) { 3596 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3597 ++OpOffset; 3598 } 3599 3600 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3601 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3602 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3603 3604 MachineMemOperand *MMO = *MI.memoperands_begin(); 3605 3606 unsigned ImmOffset; 3607 unsigned TotalOffset; 3608 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3609 if (TotalOffset != 0) 3610 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3611 3612 if (!VIndex) 3613 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3614 3615 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3616 .addDef(Dst) 3617 .addUse(VData); // vdata 3618 3619 if (IsCmpSwap) 3620 MIB.addReg(CmpVal); 3621 3622 MIB.addUse(RSrc) // rsrc 3623 .addUse(VIndex) // vindex 3624 .addUse(VOffset) // voffset 3625 .addUse(SOffset) // soffset 3626 .addImm(ImmOffset) // offset(imm) 3627 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3628 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3629 .addMemOperand(MMO); 3630 3631 MI.eraseFromParent(); 3632 return true; 3633 } 3634 3635 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3636 /// vector with s16 typed elements. 3637 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3638 SmallVectorImpl<Register> &PackedAddrs, 3639 int AddrIdx, int DimIdx, int EndIdx, 3640 int NumGradients) { 3641 const LLT S16 = LLT::scalar(16); 3642 const LLT V2S16 = LLT::vector(2, 16); 3643 3644 for (int I = AddrIdx; I < EndIdx; ++I) { 3645 MachineOperand &SrcOp = MI.getOperand(I); 3646 if (!SrcOp.isReg()) 3647 continue; // _L to _LZ may have eliminated this. 3648 3649 Register AddrReg = SrcOp.getReg(); 3650 3651 if (I < DimIdx) { 3652 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3653 PackedAddrs.push_back(AddrReg); 3654 } else { 3655 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3656 // derivatives dx/dh and dx/dv are packed with undef. 3657 if (((I + 1) >= EndIdx) || 3658 ((NumGradients / 2) % 2 == 1 && 3659 (I == DimIdx + (NumGradients / 2) - 1 || 3660 I == DimIdx + NumGradients - 1)) || 3661 // Check for _L to _LZ optimization 3662 !MI.getOperand(I + 1).isReg()) { 3663 PackedAddrs.push_back( 3664 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3665 .getReg(0)); 3666 } else { 3667 PackedAddrs.push_back( 3668 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3669 .getReg(0)); 3670 ++I; 3671 } 3672 } 3673 } 3674 } 3675 3676 /// Convert from separate vaddr components to a single vector address register, 3677 /// and replace the remaining operands with $noreg. 
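/// All register address components are expected to already be 32-bit values
/// here.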
3678 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3679                                      int DimIdx, int NumVAddrs) {
3680   const LLT S32 = LLT::scalar(32);
3681
3682   SmallVector<Register, 8> AddrRegs;
3683   for (int I = 0; I != NumVAddrs; ++I) {
3684     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3685     if (SrcOp.isReg()) {
3686       AddrRegs.push_back(SrcOp.getReg());
3687       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3688     }
3689   }
3690
3691   int NumAddrRegs = AddrRegs.size();
3692   if (NumAddrRegs != 1) {
3693     // Round up to 8 elements for v5-v7
3694     // FIXME: Missing intermediate sized register classes and instructions.
3695     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3696       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3697       auto Undef = B.buildUndef(S32);
3698       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3699       NumAddrRegs = RoundedNumRegs;
3700     }
3701
3702     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3703     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3704   }
3705
3706   for (int I = 1; I != NumVAddrs; ++I) {
3707     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3708     if (SrcOp.isReg())
3709       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3710   }
3711 }
3712
3713 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3714 ///
3715 /// Depending on the subtarget, loads and stores with 16-bit element data need
3716 /// to be rewritten to use the low half of 32-bit registers, or to directly use
3717 /// a packed layout. 16-bit addresses should also sometimes be packed into
3718 /// 32-bit registers.
3719 ///
3720 /// We don't want to directly select image instructions just yet, but we also
3721 /// want to expose all register repacking to the legalizer/combiners. We also
3722 /// don't want a selected instruction entering RegBankSelect. In order to avoid
3723 /// defining a multitude of intermediate image instructions, directly hack on
3724 /// the intrinsic's arguments. In cases like a16 addresses, this requires
3725 /// padding now-unnecessary arguments with $noreg.
3726 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3727     MachineInstr &MI, MachineIRBuilder &B,
3728     GISelChangeObserver &Observer,
3729     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3730
3731   const int NumDefs = MI.getNumExplicitDefs();
3732   bool IsTFE = NumDefs == 2;
3733   // We are only processing the operands of d16 image operations on subtargets
3734   // that use the unpacked register layout, or need to repack the TFE result.
3735
3736   // TODO: Do we need to guard against already legalized intrinsics?
3737   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3738       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3739
3740   MachineRegisterInfo *MRI = B.getMRI();
3741   const LLT S32 = LLT::scalar(32);
3742   const LLT S16 = LLT::scalar(16);
3743   const LLT V2S16 = LLT::vector(2, 16);
3744
3745   // Index of the first address argument.
3746   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3747
3748   int NumVAddrs, NumGradients;
3749   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3750   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3751     getDMaskIdx(BaseOpcode, NumDefs);
3752   unsigned DMask = 0;
3753
3754   // Check for 16-bit addresses and pack them if present.
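  // The gradient operands (if any) start at DimIdx and are followed by the
  // coordinate operands, so sampling the types at DimIdx and at
  // DimIdx + NumGradients distinguishes G16-only from full A16.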
3755   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3756   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3757   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3758   const bool IsG16 = GradTy == S16;
3759   const bool IsA16 = AddrTy == S16;
3760
3761   int DMaskLanes = 0;
3762   if (!BaseOpcode->Atomic) {
3763     DMask = MI.getOperand(DMaskIdx).getImm();
3764     if (BaseOpcode->Gather4) {
3765       DMaskLanes = 4;
3766     } else if (DMask != 0) {
3767       DMaskLanes = countPopulation(DMask);
3768     } else if (!IsTFE && !BaseOpcode->Store) {
3769       // If dmask is 0, this is a no-op load. This can be eliminated.
3770       B.buildUndef(MI.getOperand(0));
3771       MI.eraseFromParent();
3772       return true;
3773     }
3774   }
3775
3776   Observer.changingInstr(MI);
3777   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3778
3779   unsigned NewOpcode = NumDefs == 0 ?
3780     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3781
3782   // Track that we legalized this
3783   MI.setDesc(B.getTII().get(NewOpcode));
3784
3785   // An error flag result is still expected when TFE is on and dmask is 0.
3786   // Force dmask to be at least 1, otherwise the instruction will fail.
3787   if (IsTFE && DMask == 0) {
3788     DMask = 0x1;
3789     DMaskLanes = 1;
3790     MI.getOperand(DMaskIdx).setImm(DMask);
3791   }
3792
3793   if (BaseOpcode->Atomic) {
3794     Register VData0 = MI.getOperand(2).getReg();
3795     LLT Ty = MRI->getType(VData0);
3796
3797     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3798     if (Ty.isVector())
3799       return false;
3800
3801     if (BaseOpcode->AtomicX2) {
3802       Register VData1 = MI.getOperand(3).getReg();
3803       // The two values are packed in one register.
3804       LLT PackedTy = LLT::vector(2, Ty);
3805       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3806       MI.getOperand(2).setReg(Concat.getReg(0));
3807       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3808     }
3809   }
3810
3811   int CorrectedNumVAddrs = NumVAddrs;
3812
3813   // Optimize _L to _LZ when the LOD argument is zero.
3814   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3815         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3816     const ConstantFP *ConstantLod;
3817     const int LodIdx = AddrIdx + NumVAddrs - 1;
3818
3819     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3820       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3821         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3822         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3823           LZMappingInfo->LZ, ImageDimIntr->Dim);
3824
3825         // The starting indexes should remain in the same place.
3826         --NumVAddrs;
3827         --CorrectedNumVAddrs;
3828
3829         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3830           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3831         MI.RemoveOperand(LodIdx);
3832       }
3833     }
3834   }
3835
3836   // Optimize _mip away when 'lod' is zero.
3837   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3838     int64_t ConstantLod;
3839     const int LodIdx = AddrIdx + NumVAddrs - 1;
3840
3841     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3842       if (ConstantLod == 0) {
3843         // TODO: Change intrinsic opcode and remove operand instead of replacing
3844         // it with 0, as the _L to _LZ handling is done above.
3845         MI.getOperand(LodIdx).ChangeToImmediate(0);
3846         --CorrectedNumVAddrs;
3847       }
3848     }
3849   }
3850
3851   // Rewrite the addressing register layout before doing anything else.
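  // A16: every address component is 16-bit and is packed into v2s16 pieces.
  // G16: only the gradient components are 16-bit; the remaining 32-bit
  // coordinates are appended unpacked after the packed gradients.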
3852 if (IsA16 || IsG16) { 3853 if (IsA16) { 3854 // Target must support the feature and gradients need to be 16 bit too 3855 if (!ST.hasA16() || !IsG16) 3856 return false; 3857 } else if (!ST.hasG16()) 3858 return false; 3859 3860 if (NumVAddrs > 1) { 3861 SmallVector<Register, 4> PackedRegs; 3862 // Don't compress addresses for G16 3863 const int PackEndIdx = 3864 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3865 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3866 PackEndIdx, NumGradients); 3867 3868 if (!IsA16) { 3869 // Add uncompressed address 3870 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3871 int AddrReg = MI.getOperand(I).getReg(); 3872 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3873 PackedRegs.push_back(AddrReg); 3874 } 3875 } 3876 3877 // See also below in the non-a16 branch 3878 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3879 3880 if (!UseNSA && PackedRegs.size() > 1) { 3881 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3882 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3883 PackedRegs[0] = Concat.getReg(0); 3884 PackedRegs.resize(1); 3885 } 3886 3887 const int NumPacked = PackedRegs.size(); 3888 for (int I = 0; I != NumVAddrs; ++I) { 3889 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3890 if (!SrcOp.isReg()) { 3891 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3892 continue; 3893 } 3894 3895 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3896 3897 if (I < NumPacked) 3898 SrcOp.setReg(PackedRegs[I]); 3899 else 3900 SrcOp.setReg(AMDGPU::NoRegister); 3901 } 3902 } 3903 } else { 3904 // If the register allocator cannot place the address registers contiguously 3905 // without introducing moves, then using the non-sequential address encoding 3906 // is always preferable, since it saves VALU instructions and is usually a 3907 // wash in terms of code size or even better. 3908 // 3909 // However, we currently have no way of hinting to the register allocator 3910 // that MIMG addresses should be placed contiguously when it is possible to 3911 // do so, so force non-NSA for the common 2-address case as a heuristic. 3912 // 3913 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3914 // allocation when possible. 3915 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3916 3917 if (!UseNSA && NumVAddrs > 1) 3918 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3919 } 3920 3921 int Flags = 0; 3922 if (IsA16) 3923 Flags |= 1; 3924 if (IsG16) 3925 Flags |= 2; 3926 MI.addOperand(MachineOperand::CreateImm(Flags)); 3927 3928 if (BaseOpcode->Store) { // No TFE for stores? 3929 // TODO: Handle dmask trim 3930 Register VData = MI.getOperand(1).getReg(); 3931 LLT Ty = MRI->getType(VData); 3932 if (!Ty.isVector() || Ty.getElementType() != S16) 3933 return true; 3934 3935 Register RepackedReg = handleD16VData(B, *MRI, VData); 3936 if (RepackedReg != VData) { 3937 MI.getOperand(1).setReg(RepackedReg); 3938 } 3939 3940 return true; 3941 } 3942 3943 Register DstReg = MI.getOperand(0).getReg(); 3944 LLT Ty = MRI->getType(DstReg); 3945 const LLT EltTy = Ty.getScalarType(); 3946 const bool IsD16 = Ty.getScalarType() == S16; 3947 const int NumElts = Ty.isVector() ? 
Ty.getNumElements() : 1;
3948
3949   // Confirm that the return type is large enough for the dmask specified
3950   if (NumElts < DMaskLanes)
3951     return false;
3952
3953   if (NumElts > 4 || DMaskLanes > 4)
3954     return false;
3955
3956   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3957   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3958
3959   // The raw, dword-aligned data component of the load. The only legal cases
3960   // where this matters should be when using the packed D16 format, for
3961   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3962   LLT RoundedTy;
3963
3964   // An S32 vector to cover all data, plus the TFE result element.
3965   LLT TFETy;
3966
3967   // Register type to use for each loaded component. Will be S32 or V2S16.
3968   LLT RegTy;
3969
3970   if (IsD16 && ST.hasUnpackedD16VMem()) {
3971     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3972     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3973     RegTy = S32;
3974   } else {
3975     unsigned EltSize = EltTy.getSizeInBits();
3976     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3977     unsigned RoundedSize = 32 * RoundedElts;
3978     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3979     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3980     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3981   }
3982
3983   // The return type does not need adjustment.
3984   // TODO: Should we change s16 case to s32 or <2 x s16>?
3985   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3986     return true;
3987
3988   Register Dst1Reg;
3989
3990   // Insert after the instruction.
3991   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3992
3993   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3994   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3995   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3996   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3997
3998   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3999
4000   MI.getOperand(0).setReg(NewResultReg);
4001
4002   // In the IR, TFE is supposed to be used with a 2-element struct return
4003   // type. The instruction really returns these two values in one contiguous
4004   // register, with one additional dword beyond the loaded data. Rewrite the
4005   // return type to use a single register result.
4006
4007   if (IsTFE) {
4008     Dst1Reg = MI.getOperand(1).getReg();
4009     if (MRI->getType(Dst1Reg) != S32)
4010       return false;
4011
4012     // TODO: Make sure the TFE operand bit is set.
4013     MI.RemoveOperand(1);
4014
4015     // Handle the easy case that requires no repack instructions.
4016     if (Ty == S32) {
4017       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4018       return true;
4019     }
4020   }
4021
4022   // Now figure out how to copy the new result register back into the old
4023   // result.
4024   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4025
4026   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4027
4028   if (ResultNumRegs == 1) {
4029     assert(!IsTFE);
4030     ResultRegs[0] = NewResultReg;
4031   } else {
4032     // We have to repack into a new vector of some kind.
4033     for (int I = 0; I != NumDataRegs; ++I)
4034       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4035     B.buildUnmerge(ResultRegs, NewResultReg);
4036
4037     // Drop the final TFE element to get the data part. The TFE result is
4038     // directly written to the right place already.
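  // Illustrative example: a <4 x s16> d16 load with TFE on a packed-D16
  // subtarget uses NewResultReg = <3 x s32> (two data dwords plus the TFE
  // dword). The unmerge above produces three s32 pieces, the last of which is
  // Dst1Reg itself, so only the first two are kept as data below.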
4039 if (IsTFE) 4040 ResultRegs.resize(NumDataRegs); 4041 } 4042 4043 // For an s16 scalar result, we form an s32 result with a truncate regardless 4044 // of packed vs. unpacked. 4045 if (IsD16 && !Ty.isVector()) { 4046 B.buildTrunc(DstReg, ResultRegs[0]); 4047 return true; 4048 } 4049 4050 // Avoid a build/concat_vector of 1 entry. 4051 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4052 B.buildBitcast(DstReg, ResultRegs[0]); 4053 return true; 4054 } 4055 4056 assert(Ty.isVector()); 4057 4058 if (IsD16) { 4059 // For packed D16 results with TFE enabled, all the data components are 4060 // S32. Cast back to the expected type. 4061 // 4062 // TODO: We don't really need to use load s32 elements. We would only need one 4063 // cast for the TFE result if a multiple of v2s16 was used. 4064 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4065 for (Register &Reg : ResultRegs) 4066 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4067 } else if (ST.hasUnpackedD16VMem()) { 4068 for (Register &Reg : ResultRegs) 4069 Reg = B.buildTrunc(S16, Reg).getReg(0); 4070 } 4071 } 4072 4073 auto padWithUndef = [&](LLT Ty, int NumElts) { 4074 if (NumElts == 0) 4075 return; 4076 Register Undef = B.buildUndef(Ty).getReg(0); 4077 for (int I = 0; I != NumElts; ++I) 4078 ResultRegs.push_back(Undef); 4079 }; 4080 4081 // Pad out any elements eliminated due to the dmask. 4082 LLT ResTy = MRI->getType(ResultRegs[0]); 4083 if (!ResTy.isVector()) { 4084 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4085 B.buildBuildVector(DstReg, ResultRegs); 4086 return true; 4087 } 4088 4089 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4090 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4091 4092 // Deal with the one annoying legal case. 4093 const LLT V3S16 = LLT::vector(3, 16); 4094 if (Ty == V3S16) { 4095 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4096 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4097 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4098 return true; 4099 } 4100 4101 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4102 B.buildConcatVectors(DstReg, ResultRegs); 4103 return true; 4104 } 4105 4106 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4107 MachineInstr &MI, MachineIRBuilder &B, 4108 GISelChangeObserver &Observer) const { 4109 Register Dst = MI.getOperand(0).getReg(); 4110 LLT Ty = B.getMRI()->getType(Dst); 4111 unsigned Size = Ty.getSizeInBits(); 4112 MachineFunction &MF = B.getMF(); 4113 4114 Observer.changingInstr(MI); 4115 4116 // FIXME: We don't really need this intermediate instruction. The intrinsic 4117 // should be fixed to have a memory operand. Since it's readnone, we're not 4118 // allowed to add one. 4119 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4120 MI.RemoveOperand(1); // Remove intrinsic ID 4121 4122 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4123 // TODO: Should this use datalayout alignment? 4124 const unsigned MemSize = (Size + 7) / 8; 4125 const Align MemAlign(4); 4126 MachineMemOperand *MMO = MF.getMachineMemOperand( 4127 MachinePointerInfo(), 4128 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4129 MachineMemOperand::MOInvariant, 4130 MemSize, MemAlign); 4131 MI.addMemOperand(MF, MMO); 4132 4133 // There are no 96-bit result scalar loads, but widening to 128-bit should 4134 // always be legal. 
We may need to restore this to a 96-bit result if it turns
4135 // out this needs to be converted to a vector load during RegBankSelect.
4136   if (!isPowerOf2_32(Size)) {
4137     LegalizerHelper Helper(MF, *this, Observer, B);
4138
4139     if (Ty.isVector())
4140       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4141     else
4142       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4143   }
4144
4145   Observer.changedInstr(MI);
4146   return true;
4147 }
4148
4149 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4150                                                 MachineRegisterInfo &MRI,
4151                                                 MachineIRBuilder &B) const {
4152   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4153   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4154       !ST.isTrapHandlerEnabled()) {
4155     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4156   } else {
4157     // Pass the queue pointer to the trap handler as an input, and insert the
4158     // trap instruction.
4159     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4160     const ArgDescriptor *Arg =
4161         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4161     if (!Arg)
4162       return false;
4163     MachineRegisterInfo &MRI = *B.getMRI();
4164     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4165     Register LiveIn = getLiveInRegister(
4166         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4167         /*InsertLiveInCopy=*/false);
4168     if (!loadInputValue(LiveIn, B, Arg))
4169       return false;
4170     B.buildCopy(SGPR01, LiveIn);
4171     B.buildInstr(AMDGPU::S_TRAP)
4172         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4173         .addReg(SGPR01, RegState::Implicit);
4174   }
4175
4176   MI.eraseFromParent();
4177   return true;
4178 }
4179
4180 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4181     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4182   // If this is a non-HSA path or the trap handler is disabled, report a
4183   // warning instead.
4184   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4185       !ST.isTrapHandlerEnabled()) {
4186     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4187                                      "debugtrap handler not supported",
4188                                      MI.getDebugLoc(), DS_Warning);
4189     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4190     Ctx.diagnose(NoTrap);
4191   } else {
4192     // Insert the debug-trap instruction.
4193     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4194   }
4195
4196   MI.eraseFromParent();
4197   return true;
4198 }
4199
4200 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4201                                             MachineInstr &MI) const {
4202   MachineIRBuilder &B = Helper.MIRBuilder;
4203   MachineRegisterInfo &MRI = *B.getMRI();
4204
4205   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
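  // For amdgcn_if/else/loop the conditional branch produced by the IRTranslator
  // is rewritten into SI_IF/SI_ELSE/SI_LOOP: the unconditional successor is
  // folded into the pseudo, and any existing G_BR is retargeted at the
  // conditional successor.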
4206 auto IntrID = MI.getIntrinsicID(); 4207 switch (IntrID) { 4208 case Intrinsic::amdgcn_if: 4209 case Intrinsic::amdgcn_else: { 4210 MachineInstr *Br = nullptr; 4211 MachineBasicBlock *UncondBrTarget = nullptr; 4212 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4213 const SIRegisterInfo *TRI 4214 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4215 4216 Register Def = MI.getOperand(1).getReg(); 4217 Register Use = MI.getOperand(3).getReg(); 4218 4219 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4220 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4221 if (IntrID == Intrinsic::amdgcn_if) { 4222 B.buildInstr(AMDGPU::SI_IF) 4223 .addDef(Def) 4224 .addUse(Use) 4225 .addMBB(UncondBrTarget); 4226 } else { 4227 B.buildInstr(AMDGPU::SI_ELSE) 4228 .addDef(Def) 4229 .addUse(Use) 4230 .addMBB(UncondBrTarget) 4231 .addImm(0); 4232 } 4233 4234 if (Br) { 4235 Br->getOperand(0).setMBB(CondBrTarget); 4236 } else { 4237 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4238 // since we're swapping branch targets it needs to be reinserted. 4239 // FIXME: IRTranslator should probably not do this 4240 B.buildBr(*CondBrTarget); 4241 } 4242 4243 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4244 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4245 MI.eraseFromParent(); 4246 BrCond->eraseFromParent(); 4247 return true; 4248 } 4249 4250 return false; 4251 } 4252 case Intrinsic::amdgcn_loop: { 4253 MachineInstr *Br = nullptr; 4254 MachineBasicBlock *UncondBrTarget = nullptr; 4255 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4256 const SIRegisterInfo *TRI 4257 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4258 4259 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4260 Register Reg = MI.getOperand(2).getReg(); 4261 4262 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4263 B.buildInstr(AMDGPU::SI_LOOP) 4264 .addUse(Reg) 4265 .addMBB(UncondBrTarget); 4266 4267 if (Br) 4268 Br->getOperand(0).setMBB(CondBrTarget); 4269 else 4270 B.buildBr(*CondBrTarget); 4271 4272 MI.eraseFromParent(); 4273 BrCond->eraseFromParent(); 4274 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4275 return true; 4276 } 4277 4278 return false; 4279 } 4280 case Intrinsic::amdgcn_kernarg_segment_ptr: 4281 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4282 // This only makes sense to call in a kernel, so just lower to null. 
4283 B.buildConstant(MI.getOperand(0).getReg(), 0); 4284 MI.eraseFromParent(); 4285 return true; 4286 } 4287 4288 return legalizePreloadedArgIntrin( 4289 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4290 case Intrinsic::amdgcn_implicitarg_ptr: 4291 return legalizeImplicitArgPtr(MI, MRI, B); 4292 case Intrinsic::amdgcn_workitem_id_x: 4293 return legalizePreloadedArgIntrin(MI, MRI, B, 4294 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4295 case Intrinsic::amdgcn_workitem_id_y: 4296 return legalizePreloadedArgIntrin(MI, MRI, B, 4297 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4298 case Intrinsic::amdgcn_workitem_id_z: 4299 return legalizePreloadedArgIntrin(MI, MRI, B, 4300 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4301 case Intrinsic::amdgcn_workgroup_id_x: 4302 return legalizePreloadedArgIntrin(MI, MRI, B, 4303 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4304 case Intrinsic::amdgcn_workgroup_id_y: 4305 return legalizePreloadedArgIntrin(MI, MRI, B, 4306 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4307 case Intrinsic::amdgcn_workgroup_id_z: 4308 return legalizePreloadedArgIntrin(MI, MRI, B, 4309 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4310 case Intrinsic::amdgcn_dispatch_ptr: 4311 return legalizePreloadedArgIntrin(MI, MRI, B, 4312 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4313 case Intrinsic::amdgcn_queue_ptr: 4314 return legalizePreloadedArgIntrin(MI, MRI, B, 4315 AMDGPUFunctionArgInfo::QUEUE_PTR); 4316 case Intrinsic::amdgcn_implicit_buffer_ptr: 4317 return legalizePreloadedArgIntrin( 4318 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4319 case Intrinsic::amdgcn_dispatch_id: 4320 return legalizePreloadedArgIntrin(MI, MRI, B, 4321 AMDGPUFunctionArgInfo::DISPATCH_ID); 4322 case Intrinsic::amdgcn_fdiv_fast: 4323 return legalizeFDIVFastIntrin(MI, MRI, B); 4324 case Intrinsic::amdgcn_is_shared: 4325 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4326 case Intrinsic::amdgcn_is_private: 4327 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4328 case Intrinsic::amdgcn_wavefrontsize: { 4329 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4330 MI.eraseFromParent(); 4331 return true; 4332 } 4333 case Intrinsic::amdgcn_s_buffer_load: 4334 return legalizeSBufferLoad(MI, B, Helper.Observer); 4335 case Intrinsic::amdgcn_raw_buffer_store: 4336 case Intrinsic::amdgcn_struct_buffer_store: 4337 return legalizeBufferStore(MI, MRI, B, false, false); 4338 case Intrinsic::amdgcn_raw_buffer_store_format: 4339 case Intrinsic::amdgcn_struct_buffer_store_format: 4340 return legalizeBufferStore(MI, MRI, B, false, true); 4341 case Intrinsic::amdgcn_raw_tbuffer_store: 4342 case Intrinsic::amdgcn_struct_tbuffer_store: 4343 return legalizeBufferStore(MI, MRI, B, true, true); 4344 case Intrinsic::amdgcn_raw_buffer_load: 4345 case Intrinsic::amdgcn_struct_buffer_load: 4346 return legalizeBufferLoad(MI, MRI, B, false, false); 4347 case Intrinsic::amdgcn_raw_buffer_load_format: 4348 case Intrinsic::amdgcn_struct_buffer_load_format: 4349 return legalizeBufferLoad(MI, MRI, B, true, false); 4350 case Intrinsic::amdgcn_raw_tbuffer_load: 4351 case Intrinsic::amdgcn_struct_tbuffer_load: 4352 return legalizeBufferLoad(MI, MRI, B, true, true); 4353 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4354 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4355 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4356 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4357 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4358 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4359 case 
Intrinsic::amdgcn_raw_buffer_atomic_smin: 4360 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4361 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4362 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4363 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4364 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4365 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4366 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4367 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4368 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4369 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4370 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4371 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4372 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4373 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4374 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4375 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4376 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4377 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4378 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4379 return legalizeBufferAtomic(MI, B, IntrID); 4380 case Intrinsic::amdgcn_atomic_inc: 4381 return legalizeAtomicIncDec(MI, B, true); 4382 case Intrinsic::amdgcn_atomic_dec: 4383 return legalizeAtomicIncDec(MI, B, false); 4384 case Intrinsic::trap: 4385 return legalizeTrapIntrinsic(MI, MRI, B); 4386 case Intrinsic::debugtrap: 4387 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4388 default: { 4389 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4390 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4391 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4392 return true; 4393 } 4394 } 4395 4396 return true; 4397 } 4398