1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPULegalizerInfo.h"
15
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40 "amdgpu-global-isel-new-legality",
41 cl::desc("Use GlobalISel desired legality, rather than try to use "
42 "rules compatible with selection patterns"),
43 cl::init(false),
44 cl::ReallyHidden);
45
46 static constexpr unsigned MaxRegisterSize = 1024;
47
48 // Round the number of elements to the next power of two
49 static LLT getPow2VectorType(LLT Ty) {
50 unsigned NElts = Ty.getNumElements();
51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54
55 // Round the number of bits to the next power of two
56 static LLT getPow2ScalarType(LLT Ty) {
57 unsigned Bits = Ty.getSizeInBits();
58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59 return LLT::scalar(Pow2Bits);
60 }
61
62 /// \returns true if this is an odd sized vector which should widen by adding an
63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
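/// For example, <3 x s16> (48 bits) satisfies this and is widened to
/// <4 x s16>, while <2 x s16> and <4 x s16> already fill a multiple of
/// 32 bits and are left alone.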
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, 99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair( 111 TypeIdx, 112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); 113 }; 114 } 115 116 // Increase the number of vector elements to reach the next multiple of 32-bit 117 // type. 118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 119 return [=](const LegalityQuery &Query) { 120 const LLT Ty = Query.Types[TypeIdx]; 121 122 const LLT EltTy = Ty.getElementType(); 123 const int Size = Ty.getSizeInBits(); 124 const int EltSize = EltTy.getSizeInBits(); 125 const int NextMul32 = (Size + 31) / 32; 126 127 assert(EltSize < 32); 128 129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 131 }; 132 } 133 134 static LLT getBitcastRegisterType(const LLT Ty) { 135 const unsigned Size = Ty.getSizeInBits(); 136 137 LLT CoercedTy; 138 if (Size <= 32) { 139 // <2 x s8> -> s16 140 // <4 x s8> -> s32 141 return LLT::scalar(Size); 142 } 143 144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 145 } 146 147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 148 return [=](const LegalityQuery &Query) { 149 const LLT Ty = Query.Types[TypeIdx]; 150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 151 }; 152 } 153 154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 155 return [=](const LegalityQuery &Query) { 156 const LLT Ty = Query.Types[TypeIdx]; 157 unsigned Size = Ty.getSizeInBits(); 158 assert(Size % 32 == 0); 159 return std::make_pair( 160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 161 }; 162 } 163 164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 165 return [=](const LegalityQuery &Query) { 166 const LLT QueryTy = Query.Types[TypeIdx]; 167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 168 }; 169 } 170 171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 172 return [=](const LegalityQuery &Query) { 173 const LLT 
QueryTy = Query.Types[TypeIdx]; 174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 175 }; 176 } 177 178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 179 return [=](const LegalityQuery &Query) { 180 const LLT QueryTy = Query.Types[TypeIdx]; 181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 182 }; 183 } 184 185 static bool isRegisterSize(unsigned Size) { 186 return Size % 32 == 0 && Size <= MaxRegisterSize; 187 } 188 189 static bool isRegisterVectorElementType(LLT EltTy) { 190 const int EltSize = EltTy.getSizeInBits(); 191 return EltSize == 16 || EltSize % 32 == 0; 192 } 193 194 static bool isRegisterVectorType(LLT Ty) { 195 const int EltSize = Ty.getElementType().getSizeInBits(); 196 return EltSize == 32 || EltSize == 64 || 197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 198 EltSize == 128 || EltSize == 256; 199 } 200 201 static bool isRegisterType(LLT Ty) { 202 if (!isRegisterSize(Ty.getSizeInBits())) 203 return false; 204 205 if (Ty.isVector()) 206 return isRegisterVectorType(Ty); 207 208 return true; 209 } 210 211 // Any combination of 32 or 64-bit elements up the maximum register size, and 212 // multiples of v2s16. 213 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 214 return [=](const LegalityQuery &Query) { 215 return isRegisterType(Query.Types[TypeIdx]); 216 }; 217 } 218 219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 220 return [=](const LegalityQuery &Query) { 221 const LLT QueryTy = Query.Types[TypeIdx]; 222 if (!QueryTy.isVector()) 223 return false; 224 const LLT EltTy = QueryTy.getElementType(); 225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 226 }; 227 } 228 229 // If we have a truncating store or an extending load with a data size larger 230 // than 32-bits, we need to reduce to a 32-bit type. 231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 232 return [=](const LegalityQuery &Query) { 233 const LLT Ty = Query.Types[TypeIdx]; 234 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 236 }; 237 } 238 239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 240 // handle some operations by just promoting the register during 241 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 243 bool IsLoad) { 244 switch (AS) { 245 case AMDGPUAS::PRIVATE_ADDRESS: 246 // FIXME: Private element size. 247 return ST.enableFlatScratch() ? 128 : 32; 248 case AMDGPUAS::LOCAL_ADDRESS: 249 return ST.useDS128() ? 128 : 64; 250 case AMDGPUAS::GLOBAL_ADDRESS: 251 case AMDGPUAS::CONSTANT_ADDRESS: 252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 253 // Treat constant and global as identical. SMRD loads are sometimes usable for 254 // global loads (ideally constant address space should be eliminated) 255 // depending on the context. Legality cannot be context dependent, but 256 // RegBankSelect can split the load as necessary depending on the pointer 257 // register bank/uniformity and if the memory is invariant or not written in a 258 // kernel. 259 return IsLoad ? 512 : 128; 260 default: 261 // Flat addresses may contextually need to be split to 32-bit parts if they 262 // may alias scratch depending on the subtarget. 
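// Any address space without an explicit case above (e.g. region) also gets
// this conservative 128-bit default.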
263 return 128; 264 } 265 } 266 267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 268 const LegalityQuery &Query) { 269 const LLT Ty = Query.Types[0]; 270 271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 273 274 unsigned RegSize = Ty.getSizeInBits(); 275 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 276 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 277 unsigned AS = Query.Types[1].getAddressSpace(); 278 279 // All of these need to be custom lowered to cast the pointer operand. 280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 281 return false; 282 283 // Do not handle extending vector loads. 284 if (Ty.isVector() && MemSize != RegSize) 285 return false; 286 287 // TODO: We should be able to widen loads if the alignment is high enough, but 288 // we also need to modify the memory access size. 289 #if 0 290 // Accept widening loads based on alignment. 291 if (IsLoad && MemSize < Size) 292 MemSize = std::max(MemSize, Align); 293 #endif 294 295 // Only 1-byte and 2-byte to 32-bit extloads are valid. 296 if (MemSize != RegSize && RegSize != 32) 297 return false; 298 299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 300 return false; 301 302 switch (MemSize) { 303 case 8: 304 case 16: 305 case 32: 306 case 64: 307 case 128: 308 break; 309 case 96: 310 if (!ST.hasDwordx3LoadStores()) 311 return false; 312 break; 313 case 256: 314 case 512: 315 // These may contextually need to be broken down. 316 break; 317 default: 318 return false; 319 } 320 321 assert(RegSize >= MemSize); 322 323 if (AlignBits < MemSize) { 324 const SITargetLowering *TLI = ST.getTargetLowering(); 325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 326 Align(AlignBits / 8))) 327 return false; 328 } 329 330 return true; 331 } 332 333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 334 // workaround this. Eventually it should ignore the type for loads and only care 335 // about the size. Return true in cases where we will workaround this for now by 336 // bitcasting. 337 static bool loadStoreBitcastWorkaround(const LLT Ty) { 338 if (EnableNewLegality) 339 return false; 340 341 const unsigned Size = Ty.getSizeInBits(); 342 if (Size <= 64) 343 return false; 344 if (!Ty.isVector()) 345 return true; 346 347 LLT EltTy = Ty.getElementType(); 348 if (EltTy.isPointer()) 349 return true; 350 351 unsigned EltSize = EltTy.getSizeInBits(); 352 return EltSize != 32 && EltSize != 64; 353 } 354 355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 356 const LLT Ty = Query.Types[0]; 357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 358 !loadStoreBitcastWorkaround(Ty); 359 } 360 361 /// Return true if a load or store of the type should be lowered with a bitcast 362 /// to a different type. 363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 364 const LLT MemTy) { 365 const unsigned MemSizeInBits = MemTy.getSizeInBits(); 366 const unsigned Size = Ty.getSizeInBits(); 367 if (Size != MemSizeInBits) 368 return Size <= 32 && Ty.isVector(); 369 370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 371 return true; 372 373 // Don't try to handle bitcasting vector ext loads for now. 
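// e.g. a non-extending <4 x s8> access, whose s8 elements are not a valid
// register element type, is handled by bitcasting to s32 (see
// getBitcastRegisterType).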
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 375 (Size <= 32 || isRegisterSize(Size)) && 376 !isRegisterVectorElementType(Ty.getElementType()); 377 } 378 379 /// Return true if we should legalize a load by widening an odd sized memory 380 /// access up to the alignment. Note this case when the memory access itself 381 /// changes, not the size of the result register. 382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, 383 unsigned AlignInBits, unsigned AddrSpace, 384 unsigned Opcode) { 385 unsigned SizeInBits = MemoryTy.getSizeInBits(); 386 // We don't want to widen cases that are naturally legal. 387 if (isPowerOf2_32(SizeInBits)) 388 return false; 389 390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 391 // end up widening these for a scalar load during RegBankSelect, since there 392 // aren't 96-bit scalar loads. 393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 394 return false; 395 396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 397 return false; 398 399 // A load is known dereferenceable up to the alignment, so it's legal to widen 400 // to it. 401 // 402 // TODO: Could check dereferenceable for less aligned cases. 403 unsigned RoundedSize = NextPowerOf2(SizeInBits); 404 if (AlignInBits < RoundedSize) 405 return false; 406 407 // Do not widen if it would introduce a slow unaligned load. 408 const SITargetLowering *TLI = ST.getTargetLowering(); 409 bool Fast = false; 410 return TLI->allowsMisalignedMemoryAccessesImpl( 411 RoundedSize, AddrSpace, Align(AlignInBits / 8), 412 MachineMemOperand::MOLoad, &Fast) && 413 Fast; 414 } 415 416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 417 unsigned Opcode) { 418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 419 return false; 420 421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, 422 Query.MMODescrs[0].AlignInBits, 423 Query.Types[1].getAddressSpace(), Opcode); 424 } 425 426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 427 const GCNTargetMachine &TM) 428 : ST(ST_) { 429 using namespace TargetOpcode; 430 431 auto GetAddrSpacePtr = [&TM](unsigned AS) { 432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 433 }; 434 435 const LLT S1 = LLT::scalar(1); 436 const LLT S8 = LLT::scalar(8); 437 const LLT S16 = LLT::scalar(16); 438 const LLT S32 = LLT::scalar(32); 439 const LLT S64 = LLT::scalar(64); 440 const LLT S128 = LLT::scalar(128); 441 const LLT S256 = LLT::scalar(256); 442 const LLT S512 = LLT::scalar(512); 443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 444 445 const LLT V2S8 = LLT::fixed_vector(2, 8); 446 const LLT V2S16 = LLT::fixed_vector(2, 16); 447 const LLT V4S16 = LLT::fixed_vector(4, 16); 448 449 const LLT V2S32 = LLT::fixed_vector(2, 32); 450 const LLT V3S32 = LLT::fixed_vector(3, 32); 451 const LLT V4S32 = LLT::fixed_vector(4, 32); 452 const LLT V5S32 = LLT::fixed_vector(5, 32); 453 const LLT V6S32 = LLT::fixed_vector(6, 32); 454 const LLT V7S32 = LLT::fixed_vector(7, 32); 455 const LLT V8S32 = LLT::fixed_vector(8, 32); 456 const LLT V9S32 = LLT::fixed_vector(9, 32); 457 const LLT V10S32 = LLT::fixed_vector(10, 32); 458 const LLT V11S32 = LLT::fixed_vector(11, 32); 459 const LLT V12S32 = LLT::fixed_vector(12, 32); 460 const LLT V13S32 = LLT::fixed_vector(13, 32); 461 const LLT V14S32 = LLT::fixed_vector(14, 32); 462 const LLT V15S32 = LLT::fixed_vector(15, 32); 463 const LLT V16S32 = LLT::fixed_vector(16, 32); 464 const LLT V32S32 = 
LLT::fixed_vector(32, 32); 465 466 const LLT V2S64 = LLT::fixed_vector(2, 64); 467 const LLT V3S64 = LLT::fixed_vector(3, 64); 468 const LLT V4S64 = LLT::fixed_vector(4, 64); 469 const LLT V5S64 = LLT::fixed_vector(5, 64); 470 const LLT V6S64 = LLT::fixed_vector(6, 64); 471 const LLT V7S64 = LLT::fixed_vector(7, 64); 472 const LLT V8S64 = LLT::fixed_vector(8, 64); 473 const LLT V16S64 = LLT::fixed_vector(16, 64); 474 475 std::initializer_list<LLT> AllS32Vectors = 476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 478 std::initializer_list<LLT> AllS64Vectors = 479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 480 481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 488 489 const LLT CodePtr = FlatPtr; 490 491 const std::initializer_list<LLT> AddrSpaces64 = { 492 GlobalPtr, ConstantPtr, FlatPtr 493 }; 494 495 const std::initializer_list<LLT> AddrSpaces32 = { 496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 497 }; 498 499 const std::initializer_list<LLT> FPTypesBase = { 500 S32, S64 501 }; 502 503 const std::initializer_list<LLT> FPTypes16 = { 504 S32, S64, S16 505 }; 506 507 const std::initializer_list<LLT> FPTypesPK16 = { 508 S32, S64, S16, V2S16 509 }; 510 511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 512 513 // s1 for VCC branches, s32 for SCC branches. 514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 515 516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 517 // elements for v3s16 518 getActionDefinitionsBuilder(G_PHI) 519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 520 .legalFor(AllS32Vectors) 521 .legalFor(AllS64Vectors) 522 .legalFor(AddrSpaces64) 523 .legalFor(AddrSpaces32) 524 .legalIf(isPointer(0)) 525 .clampScalar(0, S16, S256) 526 .widenScalarToNextPow2(0, 32) 527 .clampMaxNumElements(0, S32, 16) 528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 529 .scalarize(0); 530 531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 532 // Full set of gfx9 features. 533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 534 .legalFor({S32, S16, V2S16}) 535 .clampScalar(0, S16, S32) 536 .clampMaxNumElements(0, S16, 2) 537 .scalarize(0) 538 .widenScalarToNextPow2(0, 32); 539 540 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 541 .legalFor({S32, S16, V2S16}) // Clamp modifier 542 .minScalarOrElt(0, S16) 543 .clampMaxNumElements(0, S16, 2) 544 .scalarize(0) 545 .widenScalarToNextPow2(0, 32) 546 .lower(); 547 } else if (ST.has16BitInsts()) { 548 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 549 .legalFor({S32, S16}) 550 .clampScalar(0, S16, S32) 551 .scalarize(0) 552 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 553 554 // Technically the saturating operations require clamp bit support, but this 555 // was introduced at the same time as 16-bit operations. 
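// That is why this branch does not check hasIntClamp() separately, unlike
// the no-16-bit path further down.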
556 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 557 .legalFor({S32, S16}) // Clamp modifier 558 .minScalar(0, S16) 559 .scalarize(0) 560 .widenScalarToNextPow2(0, 16) 561 .lower(); 562 563 // We're just lowering this, but it helps get a better result to try to 564 // coerce to the desired type first. 565 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 566 .minScalar(0, S16) 567 .scalarize(0) 568 .lower(); 569 } else { 570 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 571 .legalFor({S32}) 572 .clampScalar(0, S32, S32) 573 .scalarize(0); 574 575 if (ST.hasIntClamp()) { 576 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 577 .legalFor({S32}) // Clamp modifier. 578 .scalarize(0) 579 .minScalarOrElt(0, S32) 580 .lower(); 581 } else { 582 // Clamp bit support was added in VI, along with 16-bit operations. 583 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 584 .minScalar(0, S32) 585 .scalarize(0) 586 .lower(); 587 } 588 589 // FIXME: DAG expansion gets better results. The widening uses the smaller 590 // range values and goes for the min/max lowering directly. 591 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 592 .minScalar(0, S32) 593 .scalarize(0) 594 .lower(); 595 } 596 597 getActionDefinitionsBuilder( 598 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 599 .customFor({S32, S64}) 600 .clampScalar(0, S32, S64) 601 .widenScalarToNextPow2(0, 32) 602 .scalarize(0); 603 604 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 605 .legalFor({S32}) 606 .maxScalarOrElt(0, S32); 607 608 if (ST.hasVOP3PInsts()) { 609 Mulh 610 .clampMaxNumElements(0, S8, 2) 611 .lowerFor({V2S8}); 612 } 613 614 Mulh 615 .scalarize(0) 616 .lower(); 617 618 // Report legal for any types we can handle anywhere. For the cases only legal 619 // on the SALU, RegBankSelect will be able to re-legalize. 620 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 621 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 622 .clampScalar(0, S32, S64) 623 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 624 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 625 .widenScalarToNextPow2(0) 626 .scalarize(0); 627 628 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 629 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 630 .legalFor({{S32, S1}, {S32, S32}}) 631 .minScalar(0, S32) 632 // TODO: .scalarize(0) 633 .lower(); 634 635 getActionDefinitionsBuilder(G_BITCAST) 636 // Don't worry about the size constraint. 637 .legalIf(all(isRegisterType(0), isRegisterType(1))) 638 .lower(); 639 640 641 getActionDefinitionsBuilder(G_CONSTANT) 642 .legalFor({S1, S32, S64, S16, GlobalPtr, 643 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 644 .legalIf(isPointer(0)) 645 .clampScalar(0, S32, S64) 646 .widenScalarToNextPow2(0); 647 648 getActionDefinitionsBuilder(G_FCONSTANT) 649 .legalFor({S32, S64, S16}) 650 .clampScalar(0, S16, S64); 651 652 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 653 .legalIf(isRegisterType(0)) 654 // s1 and s16 are special cases because they have legal operations on 655 // them, but don't really occupy registers in the normal way. 656 .legalFor({S1, S16}) 657 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 658 .clampScalarOrElt(0, S32, MaxScalar) 659 .widenScalarToNextPow2(0, 32) 660 .clampMaxNumElements(0, S32, 16); 661 662 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 663 664 // If the amount is divergent, we have to do a wave reduction to get the 665 // maximum value, so this is expanded during RegBankSelect. 
666 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 667 .legalFor({{PrivatePtr, S32}}); 668 669 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 670 .customIf(typeIsNot(0, PrivatePtr)); 671 672 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 673 674 auto &FPOpActions = getActionDefinitionsBuilder( 675 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 676 .legalFor({S32, S64}); 677 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 678 .customFor({S32, S64}); 679 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 680 .customFor({S32, S64}); 681 682 if (ST.has16BitInsts()) { 683 if (ST.hasVOP3PInsts()) 684 FPOpActions.legalFor({S16, V2S16}); 685 else 686 FPOpActions.legalFor({S16}); 687 688 TrigActions.customFor({S16}); 689 FDIVActions.customFor({S16}); 690 } 691 692 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 693 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 694 695 if (ST.hasVOP3PInsts()) { 696 MinNumMaxNum.customFor(FPTypesPK16) 697 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 698 .clampMaxNumElements(0, S16, 2) 699 .clampScalar(0, S16, S64) 700 .scalarize(0); 701 } else if (ST.has16BitInsts()) { 702 MinNumMaxNum.customFor(FPTypes16) 703 .clampScalar(0, S16, S64) 704 .scalarize(0); 705 } else { 706 MinNumMaxNum.customFor(FPTypesBase) 707 .clampScalar(0, S32, S64) 708 .scalarize(0); 709 } 710 711 if (ST.hasVOP3PInsts()) 712 FPOpActions.clampMaxNumElements(0, S16, 2); 713 714 FPOpActions 715 .scalarize(0) 716 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 717 718 TrigActions 719 .scalarize(0) 720 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 721 722 FDIVActions 723 .scalarize(0) 724 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 725 726 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 727 .legalFor(FPTypesPK16) 728 .clampMaxNumElements(0, S16, 2) 729 .scalarize(0) 730 .clampScalar(0, S16, S64); 731 732 if (ST.has16BitInsts()) { 733 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 734 .legalFor({S32, S64, S16}) 735 .scalarize(0) 736 .clampScalar(0, S16, S64); 737 } else { 738 getActionDefinitionsBuilder(G_FSQRT) 739 .legalFor({S32, S64}) 740 .scalarize(0) 741 .clampScalar(0, S32, S64); 742 743 if (ST.hasFractBug()) { 744 getActionDefinitionsBuilder(G_FFLOOR) 745 .customFor({S64}) 746 .legalFor({S32, S64}) 747 .scalarize(0) 748 .clampScalar(0, S32, S64); 749 } else { 750 getActionDefinitionsBuilder(G_FFLOOR) 751 .legalFor({S32, S64}) 752 .scalarize(0) 753 .clampScalar(0, S32, S64); 754 } 755 } 756 757 getActionDefinitionsBuilder(G_FPTRUNC) 758 .legalFor({{S32, S64}, {S16, S32}}) 759 .scalarize(0) 760 .lower(); 761 762 getActionDefinitionsBuilder(G_FPEXT) 763 .legalFor({{S64, S32}, {S32, S16}}) 764 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 765 .scalarize(0); 766 767 getActionDefinitionsBuilder(G_FSUB) 768 // Use actual fsub instruction 769 .legalFor({S32}) 770 // Must use fadd + fneg 771 .lowerFor({S64, S16, V2S16}) 772 .scalarize(0) 773 .clampScalar(0, S32, S64); 774 775 // Whether this is legal depends on the floating point mode for the function. 
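// These are therefore marked custom below and resolved in legalizeFMad
// based on the function's FP mode.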
776 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 777 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 778 FMad.customFor({S32, S16}); 779 else if (ST.hasMadMacF32Insts()) 780 FMad.customFor({S32}); 781 else if (ST.hasMadF16()) 782 FMad.customFor({S16}); 783 FMad.scalarize(0) 784 .lower(); 785 786 auto &FRem = getActionDefinitionsBuilder(G_FREM); 787 if (ST.has16BitInsts()) { 788 FRem.customFor({S16, S32, S64}); 789 } else { 790 FRem.minScalar(0, S32) 791 .customFor({S32, S64}); 792 } 793 FRem.scalarize(0); 794 795 // TODO: Do we need to clamp maximum bitwidth? 796 getActionDefinitionsBuilder(G_TRUNC) 797 .legalIf(isScalar(0)) 798 .legalFor({{V2S16, V2S32}}) 799 .clampMaxNumElements(0, S16, 2) 800 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 801 // situations (like an invalid implicit use), we don't want to infinite loop 802 // in the legalizer. 803 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 804 .alwaysLegal(); 805 806 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 807 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 808 {S32, S1}, {S64, S1}, {S16, S1}}) 809 .scalarize(0) 810 .clampScalar(0, S32, S64) 811 .widenScalarToNextPow2(1, 32); 812 813 // TODO: Split s1->s64 during regbankselect for VALU. 814 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 815 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 816 .lowerFor({{S32, S64}}) 817 .lowerIf(typeIs(1, S1)) 818 .customFor({{S64, S64}}); 819 if (ST.has16BitInsts()) 820 IToFP.legalFor({{S16, S16}}); 821 IToFP.clampScalar(1, S32, S64) 822 .minScalar(0, S32) 823 .scalarize(0) 824 .widenScalarToNextPow2(1); 825 826 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 827 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 828 .customFor({{S64, S32}, {S64, S64}}) 829 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 830 if (ST.has16BitInsts()) 831 FPToI.legalFor({{S16, S16}}); 832 else 833 FPToI.minScalar(1, S32); 834 835 FPToI.minScalar(0, S32) 836 .widenScalarToNextPow2(0, 32) 837 .scalarize(0) 838 .lower(); 839 840 // Lower roundeven into G_FRINT 841 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 842 .scalarize(0) 843 .lower(); 844 845 if (ST.has16BitInsts()) { 846 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 847 .legalFor({S16, S32, S64}) 848 .clampScalar(0, S16, S64) 849 .scalarize(0); 850 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 851 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 852 .legalFor({S32, S64}) 853 .clampScalar(0, S32, S64) 854 .scalarize(0); 855 } else { 856 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 857 .legalFor({S32}) 858 .customFor({S64}) 859 .clampScalar(0, S32, S64) 860 .scalarize(0); 861 } 862 863 getActionDefinitionsBuilder(G_PTR_ADD) 864 .legalIf(all(isPointer(0), sameSize(0, 1))) 865 .scalarize(0) 866 .scalarSameSizeAs(1, 0); 867 868 getActionDefinitionsBuilder(G_PTRMASK) 869 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 870 .scalarSameSizeAs(1, 0) 871 .scalarize(0); 872 873 auto &CmpBuilder = 874 getActionDefinitionsBuilder(G_ICMP) 875 // The compare output type differs based on the register bank of the output, 876 // so make both s1 and s32 legal. 877 // 878 // Scalar compares producing output in scc will be promoted to s32, as that 879 // is the allocatable register type that will be needed for the copy from 880 // scc. 
This will be promoted during RegBankSelect, and we assume something 881 // before that won't try to use s32 result types. 882 // 883 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 884 // bank. 885 .legalForCartesianProduct( 886 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 887 .legalForCartesianProduct( 888 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 889 if (ST.has16BitInsts()) { 890 CmpBuilder.legalFor({{S1, S16}}); 891 } 892 893 CmpBuilder 894 .widenScalarToNextPow2(1) 895 .clampScalar(1, S32, S64) 896 .scalarize(0) 897 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 898 899 getActionDefinitionsBuilder(G_FCMP) 900 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 901 .widenScalarToNextPow2(1) 902 .clampScalar(1, S32, S64) 903 .scalarize(0); 904 905 // FIXME: fpow has a selection pattern that should move to custom lowering. 906 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 907 if (ST.has16BitInsts()) 908 Exp2Ops.legalFor({S32, S16}); 909 else 910 Exp2Ops.legalFor({S32}); 911 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 912 Exp2Ops.scalarize(0); 913 914 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 915 if (ST.has16BitInsts()) 916 ExpOps.customFor({{S32}, {S16}}); 917 else 918 ExpOps.customFor({S32}); 919 ExpOps.clampScalar(0, MinScalarFPTy, S32) 920 .scalarize(0); 921 922 getActionDefinitionsBuilder(G_FPOWI) 923 .clampScalar(0, MinScalarFPTy, S32) 924 .lower(); 925 926 // The 64-bit versions produce 32-bit results, but only on the SALU. 927 getActionDefinitionsBuilder(G_CTPOP) 928 .legalFor({{S32, S32}, {S32, S64}}) 929 .clampScalar(0, S32, S32) 930 .clampScalar(1, S32, S64) 931 .scalarize(0) 932 .widenScalarToNextPow2(0, 32) 933 .widenScalarToNextPow2(1, 32); 934 935 // The hardware instructions return a different result on 0 than the generic 936 // instructions expect. The hardware produces -1, but these produce the 937 // bitwidth. 938 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 939 .scalarize(0) 940 .clampScalar(0, S32, S32) 941 .clampScalar(1, S32, S64) 942 .widenScalarToNextPow2(0, 32) 943 .widenScalarToNextPow2(1, 32) 944 .lower(); 945 946 // The 64-bit versions produce 32-bit results, but only on the SALU. 947 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 948 .legalFor({{S32, S32}, {S32, S64}}) 949 .clampScalar(0, S32, S32) 950 .clampScalar(1, S32, S64) 951 .scalarize(0) 952 .widenScalarToNextPow2(0, 32) 953 .widenScalarToNextPow2(1, 32); 954 955 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 956 // RegBankSelect. 957 getActionDefinitionsBuilder(G_BITREVERSE) 958 .legalFor({S32, S64}) 959 .clampScalar(0, S32, S64) 960 .scalarize(0) 961 .widenScalarToNextPow2(0); 962 963 if (ST.has16BitInsts()) { 964 getActionDefinitionsBuilder(G_BSWAP) 965 .legalFor({S16, S32, V2S16}) 966 .clampMaxNumElements(0, S16, 2) 967 // FIXME: Fixing non-power-of-2 before clamp is workaround for 968 // narrowScalar limitation. 
969 .widenScalarToNextPow2(0) 970 .clampScalar(0, S16, S32) 971 .scalarize(0); 972 973 if (ST.hasVOP3PInsts()) { 974 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 975 .legalFor({S32, S16, V2S16}) 976 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 977 .clampMaxNumElements(0, S16, 2) 978 .minScalar(0, S16) 979 .widenScalarToNextPow2(0) 980 .scalarize(0) 981 .lower(); 982 } else { 983 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 984 .legalFor({S32, S16}) 985 .widenScalarToNextPow2(0) 986 .minScalar(0, S16) 987 .scalarize(0) 988 .lower(); 989 } 990 } else { 991 // TODO: Should have same legality without v_perm_b32 992 getActionDefinitionsBuilder(G_BSWAP) 993 .legalFor({S32}) 994 .lowerIf(scalarNarrowerThan(0, 32)) 995 // FIXME: Fixing non-power-of-2 before clamp is workaround for 996 // narrowScalar limitation. 997 .widenScalarToNextPow2(0) 998 .maxScalar(0, S32) 999 .scalarize(0) 1000 .lower(); 1001 1002 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1003 .legalFor({S32}) 1004 .minScalar(0, S32) 1005 .widenScalarToNextPow2(0) 1006 .scalarize(0) 1007 .lower(); 1008 } 1009 1010 getActionDefinitionsBuilder(G_INTTOPTR) 1011 // List the common cases 1012 .legalForCartesianProduct(AddrSpaces64, {S64}) 1013 .legalForCartesianProduct(AddrSpaces32, {S32}) 1014 .scalarize(0) 1015 // Accept any address space as long as the size matches 1016 .legalIf(sameSize(0, 1)) 1017 .widenScalarIf(smallerThan(1, 0), 1018 [](const LegalityQuery &Query) { 1019 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1020 }) 1021 .narrowScalarIf(largerThan(1, 0), 1022 [](const LegalityQuery &Query) { 1023 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1024 }); 1025 1026 getActionDefinitionsBuilder(G_PTRTOINT) 1027 // List the common cases 1028 .legalForCartesianProduct(AddrSpaces64, {S64}) 1029 .legalForCartesianProduct(AddrSpaces32, {S32}) 1030 .scalarize(0) 1031 // Accept any address space as long as the size matches 1032 .legalIf(sameSize(0, 1)) 1033 .widenScalarIf(smallerThan(0, 1), 1034 [](const LegalityQuery &Query) { 1035 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1036 }) 1037 .narrowScalarIf( 1038 largerThan(0, 1), 1039 [](const LegalityQuery &Query) { 1040 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1041 }); 1042 1043 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1044 .scalarize(0) 1045 .custom(); 1046 1047 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1048 bool IsLoad) -> bool { 1049 const LLT DstTy = Query.Types[0]; 1050 1051 // Split vector extloads. 1052 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1053 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1054 1055 if (MemSize < DstTy.getSizeInBits()) 1056 MemSize = std::max(MemSize, AlignBits); 1057 1058 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1059 return true; 1060 1061 const LLT PtrTy = Query.Types[1]; 1062 unsigned AS = PtrTy.getAddressSpace(); 1063 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1064 return true; 1065 1066 // Catch weird sized loads that don't evenly divide into the access sizes 1067 // TODO: May be able to widen depending on alignment etc. 1068 unsigned NumRegs = (MemSize + 31) / 32; 1069 if (NumRegs == 3) { 1070 if (!ST.hasDwordx3LoadStores()) 1071 return true; 1072 } else { 1073 // If the alignment allows, these should have been widened. 
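// (loads, at least, via shouldWidenLoad), so a remaining non-power-of-2
// register count has to be split.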
1074 if (!isPowerOf2_32(NumRegs)) 1075 return true; 1076 } 1077 1078 if (AlignBits < MemSize) { 1079 const SITargetLowering *TLI = ST.getTargetLowering(); 1080 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1081 Align(AlignBits / 8)); 1082 } 1083 1084 return false; 1085 }; 1086 1087 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1088 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1089 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1090 1091 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1092 // LDS 1093 // TODO: Unsupported flat for SI. 1094 1095 for (unsigned Op : {G_LOAD, G_STORE}) { 1096 const bool IsStore = Op == G_STORE; 1097 1098 auto &Actions = getActionDefinitionsBuilder(Op); 1099 // Explicitly list some common cases. 1100 // TODO: Does this help compile time at all? 1101 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1102 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1103 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1104 {S64, GlobalPtr, S64, GlobalAlign32}, 1105 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1106 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1107 {S32, GlobalPtr, S8, GlobalAlign8}, 1108 {S32, GlobalPtr, S16, GlobalAlign16}, 1109 1110 {S32, LocalPtr, S32, 32}, 1111 {S64, LocalPtr, S64, 32}, 1112 {V2S32, LocalPtr, V2S32, 32}, 1113 {S32, LocalPtr, S8, 8}, 1114 {S32, LocalPtr, S16, 16}, 1115 {V2S16, LocalPtr, S32, 32}, 1116 1117 {S32, PrivatePtr, S32, 32}, 1118 {S32, PrivatePtr, S8, 8}, 1119 {S32, PrivatePtr, S16, 16}, 1120 {V2S16, PrivatePtr, S32, 32}, 1121 1122 {S32, ConstantPtr, S32, GlobalAlign32}, 1123 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1124 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1125 {S64, ConstantPtr, S64, GlobalAlign32}, 1126 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1127 Actions.legalIf( 1128 [=](const LegalityQuery &Query) -> bool { 1129 return isLoadStoreLegal(ST, Query); 1130 }); 1131 1132 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1133 // 64-bits. 1134 // 1135 // TODO: Should generalize bitcast action into coerce, which will also cover 1136 // inserting addrspacecasts. 1137 Actions.customIf(typeIs(1, Constant32Ptr)); 1138 1139 // Turn any illegal element vectors into something easier to deal 1140 // with. These will ultimately produce 32-bit scalar shifts to extract the 1141 // parts anyway. 1142 // 1143 // For odd 16-bit element vectors, prefer to split those into pieces with 1144 // 16-bit vector parts. 1145 Actions.bitcastIf( 1146 [=](const LegalityQuery &Query) -> bool { 1147 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1148 Query.MMODescrs[0].MemoryTy); 1149 }, bitcastToRegisterType(0)); 1150 1151 if (!IsStore) { 1152 // Widen suitably aligned loads by loading extra bytes. The standard 1153 // legalization actions can't properly express widening memory operands. 
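// For example, on subtargets without dwordx3 load/stores, an s96 load known
// to be 128-bit aligned is widened to a 128-bit load (see shouldWidenLoad).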
1154 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1155 return shouldWidenLoad(ST, Query, G_LOAD); 1156 }); 1157 } 1158 1159 // FIXME: load/store narrowing should be moved to lower action 1160 Actions 1161 .narrowScalarIf( 1162 [=](const LegalityQuery &Query) -> bool { 1163 return !Query.Types[0].isVector() && 1164 needToSplitMemOp(Query, Op == G_LOAD); 1165 }, 1166 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1167 const LLT DstTy = Query.Types[0]; 1168 const LLT PtrTy = Query.Types[1]; 1169 1170 const unsigned DstSize = DstTy.getSizeInBits(); 1171 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1172 1173 // Split extloads. 1174 if (DstSize > MemSize) 1175 return std::make_pair(0, LLT::scalar(MemSize)); 1176 1177 if (!isPowerOf2_32(DstSize)) { 1178 // We're probably decomposing an odd sized store. Try to split 1179 // to the widest type. TODO: Account for alignment. As-is it 1180 // should be OK, since the new parts will be further legalized. 1181 unsigned FloorSize = PowerOf2Floor(DstSize); 1182 return std::make_pair(0, LLT::scalar(FloorSize)); 1183 } 1184 1185 if (DstSize > 32 && (DstSize % 32 != 0)) { 1186 // FIXME: Need a way to specify non-extload of larger size if 1187 // suitably aligned. 1188 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1189 } 1190 1191 unsigned MaxSize = maxSizeForAddrSpace(ST, 1192 PtrTy.getAddressSpace(), 1193 Op == G_LOAD); 1194 if (MemSize > MaxSize) 1195 return std::make_pair(0, LLT::scalar(MaxSize)); 1196 1197 unsigned Align = Query.MMODescrs[0].AlignInBits; 1198 return std::make_pair(0, LLT::scalar(Align)); 1199 }) 1200 .fewerElementsIf( 1201 [=](const LegalityQuery &Query) -> bool { 1202 return Query.Types[0].isVector() && 1203 needToSplitMemOp(Query, Op == G_LOAD); 1204 }, 1205 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1206 const LLT DstTy = Query.Types[0]; 1207 const LLT PtrTy = Query.Types[1]; 1208 1209 LLT EltTy = DstTy.getElementType(); 1210 unsigned MaxSize = maxSizeForAddrSpace(ST, 1211 PtrTy.getAddressSpace(), 1212 Op == G_LOAD); 1213 1214 // FIXME: Handle widened to power of 2 results better. This ends 1215 // up scalarizing. 1216 // FIXME: 3 element stores scalarized on SI 1217 1218 // Split if it's too large for the address space. 1219 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1220 if (MemSize > MaxSize) { 1221 unsigned NumElts = DstTy.getNumElements(); 1222 unsigned EltSize = EltTy.getSizeInBits(); 1223 1224 if (MaxSize % EltSize == 0) { 1225 return std::make_pair( 1226 0, LLT::scalarOrVector( 1227 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1228 } 1229 1230 unsigned NumPieces = MemSize / MaxSize; 1231 1232 // FIXME: Refine when odd breakdowns handled 1233 // The scalars will need to be re-legalized. 1234 if (NumPieces == 1 || NumPieces >= NumElts || 1235 NumElts % NumPieces != 0) 1236 return std::make_pair(0, EltTy); 1237 1238 return std::make_pair( 1239 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1240 } 1241 1242 // FIXME: We could probably handle weird extending loads better. 1243 if (DstTy.getSizeInBits() > MemSize) 1244 return std::make_pair(0, EltTy); 1245 1246 unsigned EltSize = EltTy.getSizeInBits(); 1247 unsigned DstSize = DstTy.getSizeInBits(); 1248 if (!isPowerOf2_32(DstSize)) { 1249 // We're probably decomposing an odd sized store. Try to split 1250 // to the widest type. TODO: Account for alignment. As-is it 1251 // should be OK, since the new parts will be further legalized. 
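// e.g. a <3 x s64> value being split is first reduced to <2 x s64> here.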
1252 unsigned FloorSize = PowerOf2Floor(DstSize); 1253 return std::make_pair( 1254 0, LLT::scalarOrVector( 1255 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1256 } 1257 1258 // Need to split because of alignment. 1259 unsigned Align = Query.MMODescrs[0].AlignInBits; 1260 if (EltSize > Align && 1261 (EltSize / Align < DstTy.getNumElements())) { 1262 return std::make_pair( 1263 0, LLT::fixed_vector(EltSize / Align, EltTy)); 1264 } 1265 1266 // May need relegalization for the scalars. 1267 return std::make_pair(0, EltTy); 1268 }) 1269 .minScalar(0, S32) 1270 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1271 .widenScalarToNextPow2(0) 1272 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1273 .lower(); 1274 } 1275 1276 // FIXME: Unaligned accesses not lowered. 1277 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1278 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1279 {S32, GlobalPtr, S16, 2 * 8}, 1280 {S32, LocalPtr, S8, 8}, 1281 {S32, LocalPtr, S16, 16}, 1282 {S32, PrivatePtr, S8, 8}, 1283 {S32, PrivatePtr, S16, 16}, 1284 {S32, ConstantPtr, S8, 8}, 1285 {S32, ConstantPtr, S16, 2 * 8}}) 1286 .legalIf( 1287 [=](const LegalityQuery &Query) -> bool { 1288 return isLoadStoreLegal(ST, Query); 1289 }); 1290 1291 if (ST.hasFlatAddressSpace()) { 1292 ExtLoads.legalForTypesWithMemDesc( 1293 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1294 } 1295 1296 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1297 // 64-bits. 1298 // 1299 // TODO: Should generalize bitcast action into coerce, which will also cover 1300 // inserting addrspacecasts. 1301 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1302 1303 ExtLoads.clampScalar(0, S32, S32) 1304 .widenScalarToNextPow2(0) 1305 .lower(); 1306 1307 auto &Atomics = getActionDefinitionsBuilder( 1308 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1309 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1310 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1311 G_ATOMICRMW_UMIN}) 1312 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1313 {S64, GlobalPtr}, {S64, LocalPtr}, 1314 {S32, RegionPtr}, {S64, RegionPtr}}); 1315 if (ST.hasFlatAddressSpace()) { 1316 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1317 } 1318 1319 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1320 if (ST.hasLDSFPAtomics()) { 1321 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1322 if (ST.hasGFX90AInsts()) 1323 Atomic.legalFor({{S64, LocalPtr}}); 1324 } 1325 if (ST.hasAtomicFaddInsts()) 1326 Atomic.legalFor({{S32, GlobalPtr}}); 1327 1328 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1329 // demarshalling 1330 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1331 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1332 {S32, FlatPtr}, {S64, FlatPtr}}) 1333 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1334 {S32, RegionPtr}, {S64, RegionPtr}}); 1335 // TODO: Pointer types, any 32-bit or 64-bit vector 1336 1337 // Condition should be s32 for scalar, s1 for vector. 
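// This mirrors the G_ICMP rules above: compares on SCC produce s32,
// compares on VCC produce s1.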
1338 getActionDefinitionsBuilder(G_SELECT) 1339 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1340 LocalPtr, FlatPtr, PrivatePtr, 1341 LLT::fixed_vector(2, LocalPtr), 1342 LLT::fixed_vector(2, PrivatePtr)}, 1343 {S1, S32}) 1344 .clampScalar(0, S16, S64) 1345 .scalarize(1) 1346 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1347 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1348 .clampMaxNumElements(0, S32, 2) 1349 .clampMaxNumElements(0, LocalPtr, 2) 1350 .clampMaxNumElements(0, PrivatePtr, 2) 1351 .scalarize(0) 1352 .widenScalarToNextPow2(0) 1353 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1354 1355 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1356 // be more flexible with the shift amount type. 1357 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1358 .legalFor({{S32, S32}, {S64, S32}}); 1359 if (ST.has16BitInsts()) { 1360 if (ST.hasVOP3PInsts()) { 1361 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1362 .clampMaxNumElements(0, S16, 2); 1363 } else 1364 Shifts.legalFor({{S16, S16}}); 1365 1366 // TODO: Support 16-bit shift amounts for all types 1367 Shifts.widenScalarIf( 1368 [=](const LegalityQuery &Query) { 1369 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1370 // 32-bit amount. 1371 const LLT ValTy = Query.Types[0]; 1372 const LLT AmountTy = Query.Types[1]; 1373 return ValTy.getSizeInBits() <= 16 && 1374 AmountTy.getSizeInBits() < 16; 1375 }, changeTo(1, S16)); 1376 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1377 Shifts.clampScalar(1, S32, S32); 1378 Shifts.clampScalar(0, S16, S64); 1379 Shifts.widenScalarToNextPow2(0, 16); 1380 1381 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1382 .minScalar(0, S16) 1383 .scalarize(0) 1384 .lower(); 1385 } else { 1386 // Make sure we legalize the shift amount type first, as the general 1387 // expansion for the shifted type will produce much worse code if it hasn't 1388 // been truncated already. 1389 Shifts.clampScalar(1, S32, S32); 1390 Shifts.clampScalar(0, S32, S64); 1391 Shifts.widenScalarToNextPow2(0, 32); 1392 1393 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1394 .minScalar(0, S32) 1395 .scalarize(0) 1396 .lower(); 1397 } 1398 Shifts.scalarize(0); 1399 1400 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1401 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1402 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1403 unsigned IdxTypeIdx = 2; 1404 1405 getActionDefinitionsBuilder(Op) 1406 .customIf([=](const LegalityQuery &Query) { 1407 const LLT EltTy = Query.Types[EltTypeIdx]; 1408 const LLT VecTy = Query.Types[VecTypeIdx]; 1409 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1410 const unsigned EltSize = EltTy.getSizeInBits(); 1411 return (EltSize == 32 || EltSize == 64) && 1412 VecTy.getSizeInBits() % 32 == 0 && 1413 VecTy.getSizeInBits() <= MaxRegisterSize && 1414 IdxTy.getSizeInBits() == 32; 1415 }) 1416 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1417 bitcastToVectorElement32(VecTypeIdx)) 1418 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1419 .bitcastIf( 1420 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1421 [=](const LegalityQuery &Query) { 1422 // For > 64-bit element types, try to turn this into a 64-bit 1423 // element vector since we may be able to do better indexing 1424 // if this is scalar. If not, fall back to 32. 
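// e.g. a <2 x s128> vector (256 bits) becomes <4 x s64> here.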
1425 const LLT EltTy = Query.Types[EltTypeIdx];
1426 const LLT VecTy = Query.Types[VecTypeIdx];
1427 const unsigned DstEltSize = EltTy.getSizeInBits();
1428 const unsigned VecSize = VecTy.getSizeInBits();
1429
1430 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1431 return std::make_pair(
1432 VecTypeIdx,
1433 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1434 })
1435 .clampScalar(EltTypeIdx, S32, S64)
1436 .clampScalar(VecTypeIdx, S32, S64)
1437 .clampScalar(IdxTypeIdx, S32, S32)
1438 .clampMaxNumElements(VecTypeIdx, S32, 32)
1439 // TODO: Clamp elements for 64-bit vectors?
1440 // It should only be necessary with variable indexes.
1441 // As a last resort, lower to the stack
1442 .lower();
1443 }
1444
1445 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1446 .unsupportedIf([=](const LegalityQuery &Query) {
1447 const LLT &EltTy = Query.Types[1].getElementType();
1448 return Query.Types[0] != EltTy;
1449 });
1450
1451 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1452 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1453 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1454
1455 // FIXME: Doesn't handle extract of illegal sizes.
1456 getActionDefinitionsBuilder(Op)
1457 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1458 // FIXME: Multiples of 16 should not be legal.
1459 .legalIf([=](const LegalityQuery &Query) {
1460 const LLT BigTy = Query.Types[BigTyIdx];
1461 const LLT LitTy = Query.Types[LitTyIdx];
1462 return (BigTy.getSizeInBits() % 32 == 0) &&
1463 (LitTy.getSizeInBits() % 16 == 0);
1464 })
1465 .widenScalarIf(
1466 [=](const LegalityQuery &Query) {
1467 const LLT BigTy = Query.Types[BigTyIdx];
1468 return (BigTy.getScalarSizeInBits() < 16);
1469 },
1470 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1471 .widenScalarIf(
1472 [=](const LegalityQuery &Query) {
1473 const LLT LitTy = Query.Types[LitTyIdx];
1474 return (LitTy.getScalarSizeInBits() < 16);
1475 },
1476 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1477 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1478 .widenScalarToNextPow2(BigTyIdx, 32);
1479
1480 }
1481
1482 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1483 .legalForCartesianProduct(AllS32Vectors, {S32})
1484 .legalForCartesianProduct(AllS64Vectors, {S64})
1485 .clampNumElements(0, V16S32, V32S32)
1486 .clampNumElements(0, V2S64, V16S64)
1487 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1488
1489 if (ST.hasScalarPackInsts()) {
1490 BuildVector
1491 // FIXME: Should probably widen s1 vectors straight to s32
1492 .minScalarOrElt(0, S16)
1493 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1494 .minScalar(1, S32);
1495
1496 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1497 .legalFor({V2S16, S32})
1498 .lower();
1499 BuildVector.minScalarOrElt(0, S32);
1500 } else {
1501 BuildVector.customFor({V2S16, S16});
1502 BuildVector.minScalarOrElt(0, S32);
1503
1504 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1505 .customFor({V2S16, S32})
1506 .lower();
1507 }
1508
1509 BuildVector.legalIf(isRegisterType(0));
1510
1511 // FIXME: Clamp maximum size
1512 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1513 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1514 .clampMaxNumElements(0, S32, 32)
1515 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1516 .clampMaxNumElements(0, S16, 64);
1517
1518 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1519 // pre-legalize.
1520 if (ST.hasVOP3PInsts()) { 1521 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1522 .customFor({V2S16, V2S16}) 1523 .lower(); 1524 } else 1525 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1526 1527 // Merge/Unmerge 1528 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1529 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1530 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1531 1532 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1533 const LLT Ty = Query.Types[TypeIdx]; 1534 if (Ty.isVector()) { 1535 const LLT &EltTy = Ty.getElementType(); 1536 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1537 return true; 1538 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1539 return true; 1540 } 1541 return false; 1542 }; 1543 1544 auto &Builder = getActionDefinitionsBuilder(Op) 1545 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1546 .lowerFor({{S16, V2S16}}) 1547 .lowerIf([=](const LegalityQuery &Query) { 1548 const LLT BigTy = Query.Types[BigTyIdx]; 1549 return BigTy.getSizeInBits() == 32; 1550 }) 1551 // Try to widen to s16 first for small types. 1552 // TODO: Only do this on targets with legal s16 shifts 1553 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1554 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1555 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1556 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1557 elementTypeIs(1, S16)), 1558 changeTo(1, V2S16)) 1559 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1560 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1561 // valid. 1562 .clampScalar(LitTyIdx, S32, S512) 1563 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1564 // Break up vectors with weird elements into scalars 1565 .fewerElementsIf( 1566 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1567 scalarize(0)) 1568 .fewerElementsIf( 1569 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1570 scalarize(1)) 1571 .clampScalar(BigTyIdx, S32, MaxScalar); 1572 1573 if (Op == G_MERGE_VALUES) { 1574 Builder.widenScalarIf( 1575 // TODO: Use 16-bit shifts if legal for 8-bit values? 1576 [=](const LegalityQuery &Query) { 1577 const LLT Ty = Query.Types[LitTyIdx]; 1578 return Ty.getSizeInBits() < 32; 1579 }, 1580 changeTo(LitTyIdx, S32)); 1581 } 1582 1583 Builder.widenScalarIf( 1584 [=](const LegalityQuery &Query) { 1585 const LLT Ty = Query.Types[BigTyIdx]; 1586 return !isPowerOf2_32(Ty.getSizeInBits()) && 1587 Ty.getSizeInBits() % 16 != 0; 1588 }, 1589 [=](const LegalityQuery &Query) { 1590 // Pick the next power of 2, or a multiple of 64 over 128. 1591 // Whichever is smaller. 1592 const LLT &Ty = Query.Types[BigTyIdx]; 1593 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1594 if (NewSizeInBits >= 256) { 1595 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1596 if (RoundedTo < NewSizeInBits) 1597 NewSizeInBits = RoundedTo; 1598 } 1599 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1600 }) 1601 // Any vectors left are the wrong size. Scalarize them. 1602 .scalarize(0) 1603 .scalarize(1); 1604 } 1605 1606 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1607 // RegBankSelect. 
1608 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1609 .legalFor({{S32}, {S64}});
1610
1611 if (ST.hasVOP3PInsts()) {
1612 SextInReg.lowerFor({{V2S16}})
1613 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1614 // get more vector shift opportunities, since we'll get those when
1615 // expanded.
1616 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1617 } else if (ST.has16BitInsts()) {
1618 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1619 } else {
1620 // Prefer to promote to s32 before lowering if we don't have 16-bit
1621 // shifts. This avoids a lot of intermediate truncate and extend operations.
1622 SextInReg.lowerFor({{S32}, {S64}});
1623 }
1624
1625 SextInReg
1626 .scalarize(0)
1627 .clampScalar(0, S32, S64)
1628 .lower();
1629
1630 // TODO: Only try to form v2s16 with legal packed instructions.
1631 getActionDefinitionsBuilder(G_FSHR)
1632 .legalFor({{S32, S32}})
1633 .lowerFor({{V2S16, V2S16}})
1634 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1635 .scalarize(0)
1636 .lower();
1637
1638 if (ST.hasVOP3PInsts()) {
1639 getActionDefinitionsBuilder(G_FSHL)
1640 .lowerFor({{V2S16, V2S16}})
1641 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1642 .scalarize(0)
1643 .lower();
1644 } else {
1645 getActionDefinitionsBuilder(G_FSHL)
1646 .scalarize(0)
1647 .lower();
1648 }
1649
1650 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1651 .legalFor({S64});
1652
1653 getActionDefinitionsBuilder(G_FENCE)
1654 .alwaysLegal();
1655
1656 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1657 .scalarize(0)
1658 .minScalar(0, S32)
1659 .lower();
1660
1661 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1662 .legalFor({{S32, S32}, {S64, S32}})
1663 .clampScalar(1, S32, S32)
1664 .clampScalar(0, S32, S64)
1665 .widenScalarToNextPow2(0)
1666 .scalarize(0);
1667
1668 getActionDefinitionsBuilder({
1669 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1670 G_FCOPYSIGN,
1671
1672 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1673 G_ATOMICRMW_NAND,
1674 G_ATOMICRMW_FSUB,
1675 G_READ_REGISTER,
1676 G_WRITE_REGISTER,
1677
1678 G_SADDO, G_SSUBO,
1679
1680 // TODO: Implement
1681 G_FMINIMUM, G_FMAXIMUM}).lower();
1682
1683 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1684 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1685 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1686 .unsupported();
1687
1688 getLegacyLegalizerInfo().computeTables();
1689 verify(*ST.getInstrInfo());
1690 }
1691
1692 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1693 MachineInstr &MI) const {
1694 MachineIRBuilder &B = Helper.MIRBuilder;
1695 MachineRegisterInfo &MRI = *B.getMRI();
1696
1697 switch (MI.getOpcode()) {
1698 case TargetOpcode::G_ADDRSPACE_CAST:
1699 return legalizeAddrSpaceCast(MI, MRI, B);
1700 case TargetOpcode::G_FRINT:
1701 return legalizeFrint(MI, MRI, B);
1702 case TargetOpcode::G_FCEIL:
1703 return legalizeFceil(MI, MRI, B);
1704 case TargetOpcode::G_FREM:
1705 return legalizeFrem(MI, MRI, B);
1706 case TargetOpcode::G_INTRINSIC_TRUNC:
1707 return legalizeIntrinsicTrunc(MI, MRI, B);
1708 case TargetOpcode::G_SITOFP:
1709 return legalizeITOFP(MI, MRI, B, true);
1710 case TargetOpcode::G_UITOFP:
1711 return legalizeITOFP(MI, MRI, B, false);
1712 case TargetOpcode::G_FPTOSI:
1713 return legalizeFPTOI(MI, MRI, B, true);
1714 case TargetOpcode::G_FPTOUI:
1715 return legalizeFPTOI(MI, MRI, B, false);
1716 case TargetOpcode::G_FMINNUM:
1717 case TargetOpcode::G_FMAXNUM:
1718 case TargetOpcode::G_FMINNUM_IEEE:
1719 case
TargetOpcode::G_FMAXNUM_IEEE: 1720 return legalizeMinNumMaxNum(Helper, MI); 1721 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1722 return legalizeExtractVectorElt(MI, MRI, B); 1723 case TargetOpcode::G_INSERT_VECTOR_ELT: 1724 return legalizeInsertVectorElt(MI, MRI, B); 1725 case TargetOpcode::G_SHUFFLE_VECTOR: 1726 return legalizeShuffleVector(MI, MRI, B); 1727 case TargetOpcode::G_FSIN: 1728 case TargetOpcode::G_FCOS: 1729 return legalizeSinCos(MI, MRI, B); 1730 case TargetOpcode::G_GLOBAL_VALUE: 1731 return legalizeGlobalValue(MI, MRI, B); 1732 case TargetOpcode::G_LOAD: 1733 case TargetOpcode::G_SEXTLOAD: 1734 case TargetOpcode::G_ZEXTLOAD: 1735 return legalizeLoad(Helper, MI); 1736 case TargetOpcode::G_FMAD: 1737 return legalizeFMad(MI, MRI, B); 1738 case TargetOpcode::G_FDIV: 1739 return legalizeFDIV(MI, MRI, B); 1740 case TargetOpcode::G_UDIV: 1741 case TargetOpcode::G_UREM: 1742 case TargetOpcode::G_UDIVREM: 1743 return legalizeUnsignedDIV_REM(MI, MRI, B); 1744 case TargetOpcode::G_SDIV: 1745 case TargetOpcode::G_SREM: 1746 case TargetOpcode::G_SDIVREM: 1747 return legalizeSignedDIV_REM(MI, MRI, B); 1748 case TargetOpcode::G_ATOMIC_CMPXCHG: 1749 return legalizeAtomicCmpXChg(MI, MRI, B); 1750 case TargetOpcode::G_FLOG: 1751 return legalizeFlog(MI, B, numbers::ln2f); 1752 case TargetOpcode::G_FLOG10: 1753 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1754 case TargetOpcode::G_FEXP: 1755 return legalizeFExp(MI, B); 1756 case TargetOpcode::G_FPOW: 1757 return legalizeFPow(MI, B); 1758 case TargetOpcode::G_FFLOOR: 1759 return legalizeFFloor(MI, MRI, B); 1760 case TargetOpcode::G_BUILD_VECTOR: 1761 return legalizeBuildVector(MI, MRI, B); 1762 default: 1763 return false; 1764 } 1765 1766 llvm_unreachable("expected switch to return"); 1767 } 1768 1769 Register AMDGPULegalizerInfo::getSegmentAperture( 1770 unsigned AS, 1771 MachineRegisterInfo &MRI, 1772 MachineIRBuilder &B) const { 1773 MachineFunction &MF = B.getMF(); 1774 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1775 const LLT S32 = LLT::scalar(32); 1776 1777 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1778 1779 if (ST.hasApertureRegs()) { 1780 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1781 // getreg. 1782 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1783 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1784 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1785 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1786 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1787 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1788 unsigned Encoding = 1789 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1790 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1791 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1792 1793 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1794 1795 B.buildInstr(AMDGPU::S_GETREG_B32) 1796 .addDef(GetReg) 1797 .addImm(Encoding); 1798 MRI.setType(GetReg, S32); 1799 1800 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1801 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1802 } 1803 1804 Register QueuePtr = MRI.createGenericVirtualRegister( 1805 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1806 1807 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1808 return Register(); 1809 1810 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1811 // private_segment_aperture_base_hi. 1812 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 
0x40 : 0x44; 1813 1814 // TODO: can we be smarter about machine pointer info? 1815 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1816 MachineMemOperand *MMO = MF.getMachineMemOperand( 1817 PtrInfo, 1818 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1819 MachineMemOperand::MOInvariant, 1820 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1821 1822 Register LoadAddr; 1823 1824 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1825 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1826 } 1827 1828 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1829 MachineInstr &MI, MachineRegisterInfo &MRI, 1830 MachineIRBuilder &B) const { 1831 MachineFunction &MF = B.getMF(); 1832 1833 const LLT S32 = LLT::scalar(32); 1834 Register Dst = MI.getOperand(0).getReg(); 1835 Register Src = MI.getOperand(1).getReg(); 1836 1837 LLT DstTy = MRI.getType(Dst); 1838 LLT SrcTy = MRI.getType(Src); 1839 unsigned DestAS = DstTy.getAddressSpace(); 1840 unsigned SrcAS = SrcTy.getAddressSpace(); 1841 1842 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1843 // vector element. 1844 assert(!DstTy.isVector()); 1845 1846 const AMDGPUTargetMachine &TM 1847 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1848 1849 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1850 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1851 return true; 1852 } 1853 1854 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1855 // Truncate. 1856 B.buildExtract(Dst, Src, 0); 1857 MI.eraseFromParent(); 1858 return true; 1859 } 1860 1861 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1862 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1863 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1864 1865 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1866 // another. Merge operands are required to be the same type, but creating an 1867 // extra ptrtoint would be kind of pointless. 1868 auto HighAddr = B.buildConstant( 1869 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1870 B.buildMerge(Dst, {Src, HighAddr}); 1871 MI.eraseFromParent(); 1872 return true; 1873 } 1874 1875 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1876 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1877 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1878 unsigned NullVal = TM.getNullPointerValue(DestAS); 1879 1880 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1881 auto FlatNull = B.buildConstant(SrcTy, 0); 1882 1883 // Extract low 32-bits of the pointer. 1884 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1885 1886 auto CmpRes = 1887 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1888 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1889 1890 MI.eraseFromParent(); 1891 return true; 1892 } 1893 1894 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1895 return false; 1896 1897 if (!ST.hasFlatAddressSpace()) 1898 return false; 1899 1900 auto SegmentNull = 1901 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1902 auto FlatNull = 1903 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1904 1905 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1906 if (!ApertureReg.isValid()) 1907 return false; 1908 1909 auto CmpRes = 1910 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1911 1912 // Coerce the type of the low half of the result so we can use merge_values. 
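// Sketch of the sequence built below for a local/private -> flat cast:
//   %lo   = G_PTRTOINT %src                  ; low 32 bits of the address
//   %flat = G_MERGE_VALUES %lo, %aperture    ; aperture supplies the high half
//   %dst  = G_SELECT (%src != segment_null), %flat, flat_null
// so a null segment pointer maps to the flat null value rather than to the
// aperture base.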
1913 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1914 1915 // TODO: Should we allow mismatched types but matching sizes in merges to 1916 // avoid the ptrtoint? 1917 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1918 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1919 1920 MI.eraseFromParent(); 1921 return true; 1922 } 1923 1924 bool AMDGPULegalizerInfo::legalizeFrint( 1925 MachineInstr &MI, MachineRegisterInfo &MRI, 1926 MachineIRBuilder &B) const { 1927 Register Src = MI.getOperand(1).getReg(); 1928 LLT Ty = MRI.getType(Src); 1929 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1930 1931 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1932 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1933 1934 auto C1 = B.buildFConstant(Ty, C1Val); 1935 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1936 1937 // TODO: Should this propagate fast-math-flags? 1938 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1939 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1940 1941 auto C2 = B.buildFConstant(Ty, C2Val); 1942 auto Fabs = B.buildFAbs(Ty, Src); 1943 1944 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1945 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1946 MI.eraseFromParent(); 1947 return true; 1948 } 1949 1950 bool AMDGPULegalizerInfo::legalizeFceil( 1951 MachineInstr &MI, MachineRegisterInfo &MRI, 1952 MachineIRBuilder &B) const { 1953 1954 const LLT S1 = LLT::scalar(1); 1955 const LLT S64 = LLT::scalar(64); 1956 1957 Register Src = MI.getOperand(1).getReg(); 1958 assert(MRI.getType(Src) == S64); 1959 1960 // result = trunc(src) 1961 // if (src > 0.0 && src != result) 1962 // result += 1.0 1963 1964 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1965 1966 const auto Zero = B.buildFConstant(S64, 0.0); 1967 const auto One = B.buildFConstant(S64, 1.0); 1968 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1969 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1970 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1971 auto Add = B.buildSelect(S64, And, One, Zero); 1972 1973 // TODO: Should this propagate fast-math-flags? 
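// Worked example of this expansion: for src = 2.5, trunc = 2.0 and the
// condition (src > 0.0 && src != trunc) holds, so 1.0 is added and the
// result is 3.0; for src = -2.5, trunc = -2.0 and the condition fails, so
// the result stays -2.0, matching ceil.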
1974 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1975 return true; 1976 } 1977 1978 bool AMDGPULegalizerInfo::legalizeFrem( 1979 MachineInstr &MI, MachineRegisterInfo &MRI, 1980 MachineIRBuilder &B) const { 1981 Register DstReg = MI.getOperand(0).getReg(); 1982 Register Src0Reg = MI.getOperand(1).getReg(); 1983 Register Src1Reg = MI.getOperand(2).getReg(); 1984 auto Flags = MI.getFlags(); 1985 LLT Ty = MRI.getType(DstReg); 1986 1987 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1988 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1989 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1990 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1991 MI.eraseFromParent(); 1992 return true; 1993 } 1994 1995 static MachineInstrBuilder extractF64Exponent(Register Hi, 1996 MachineIRBuilder &B) { 1997 const unsigned FractBits = 52; 1998 const unsigned ExpBits = 11; 1999 LLT S32 = LLT::scalar(32); 2000 2001 auto Const0 = B.buildConstant(S32, FractBits - 32); 2002 auto Const1 = B.buildConstant(S32, ExpBits); 2003 2004 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2005 .addUse(Hi) 2006 .addUse(Const0.getReg(0)) 2007 .addUse(Const1.getReg(0)); 2008 2009 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2010 } 2011 2012 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2013 MachineInstr &MI, MachineRegisterInfo &MRI, 2014 MachineIRBuilder &B) const { 2015 const LLT S1 = LLT::scalar(1); 2016 const LLT S32 = LLT::scalar(32); 2017 const LLT S64 = LLT::scalar(64); 2018 2019 Register Src = MI.getOperand(1).getReg(); 2020 assert(MRI.getType(Src) == S64); 2021 2022 // TODO: Should this use extract since the low half is unused? 2023 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2024 Register Hi = Unmerge.getReg(1); 2025 2026 // Extract the upper half, since this is where we will find the sign and 2027 // exponent. 2028 auto Exp = extractF64Exponent(Hi, B); 2029 2030 const unsigned FractBits = 52; 2031 2032 // Extract the sign bit. 2033 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2034 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2035 2036 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2037 2038 const auto Zero32 = B.buildConstant(S32, 0); 2039 2040 // Extend back to 64-bits. 2041 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2042 2043 auto Shr = B.buildAShr(S64, FractMask, Exp); 2044 auto Not = B.buildNot(S64, Shr); 2045 auto Tmp0 = B.buildAnd(S64, Src, Not); 2046 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2047 2048 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2049 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2050 2051 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2052 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2053 MI.eraseFromParent(); 2054 return true; 2055 } 2056 2057 bool AMDGPULegalizerInfo::legalizeITOFP( 2058 MachineInstr &MI, MachineRegisterInfo &MRI, 2059 MachineIRBuilder &B, bool Signed) const { 2060 2061 Register Dst = MI.getOperand(0).getReg(); 2062 Register Src = MI.getOperand(1).getReg(); 2063 2064 const LLT S64 = LLT::scalar(64); 2065 const LLT S32 = LLT::scalar(32); 2066 2067 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2068 2069 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2070 2071 auto CvtHi = Signed ? 
2072 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2073 B.buildUITOFP(S64, Unmerge.getReg(1)); 2074 2075 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2076 2077 auto ThirtyTwo = B.buildConstant(S32, 32); 2078 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2079 .addUse(CvtHi.getReg(0)) 2080 .addUse(ThirtyTwo.getReg(0)); 2081 2082 // TODO: Should this propagate fast-math-flags? 2083 B.buildFAdd(Dst, LdExp, CvtLo); 2084 MI.eraseFromParent(); 2085 return true; 2086 } 2087 2088 // TODO: Copied from DAG implementation. Verify logic and document how this 2089 // actually works. 2090 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2091 MachineRegisterInfo &MRI, 2092 MachineIRBuilder &B, 2093 bool Signed) const { 2094 2095 Register Dst = MI.getOperand(0).getReg(); 2096 Register Src = MI.getOperand(1).getReg(); 2097 2098 const LLT S64 = LLT::scalar(64); 2099 const LLT S32 = LLT::scalar(32); 2100 2101 const LLT SrcLT = MRI.getType(Src); 2102 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2103 2104 unsigned Flags = MI.getFlags(); 2105 2106 // The basic idea of converting a floating point number into a pair of 32-bit 2107 // integers is illustrated as follows: 2108 // 2109 // tf := trunc(val); 2110 // hif := floor(tf * 2^-32); 2111 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2112 // hi := fptoi(hif); 2113 // lo := fptoi(lof); 2114 // 2115 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2116 MachineInstrBuilder Sign; 2117 if (Signed && SrcLT == S32) { 2118 // However, a 32-bit floating point number has only 23 bits mantissa and 2119 // it's not enough to hold all the significant bits of `lof` if val is 2120 // negative. To avoid the loss of precision, We need to take the absolute 2121 // value after truncating and flip the result back based on the original 2122 // signedness. 2123 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2124 Trunc = B.buildFAbs(S32, Trunc, Flags); 2125 } 2126 MachineInstrBuilder K0, K1; 2127 if (SrcLT == S64) { 2128 K0 = B.buildFConstant(S64, 2129 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2130 K1 = B.buildFConstant(S64, 2131 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2132 } else { 2133 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2134 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2135 } 2136 2137 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2138 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2139 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2140 2141 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2142 : B.buildFPTOUI(S32, FloorMul); 2143 auto Lo = B.buildFPTOUI(S32, Fma); 2144 2145 if (Signed && SrcLT == S32) { 2146 // Flip the result based on the signedness, which is either all 0s or 1s. 
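// Here Sign is (x >> 31), i.e. all ones for a negative input; widening it to
// 64 bits and computing (result ^ sign) - sign below is the usual conditional
// two's-complement negation by mask.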
2147 Sign = B.buildMerge(S64, {Sign, Sign}); 2148 // r := xor({lo, hi}, sign) - sign; 2149 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2150 } else 2151 B.buildMerge(Dst, {Lo, Hi}); 2152 MI.eraseFromParent(); 2153 2154 return true; 2155 } 2156 2157 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2158 MachineInstr &MI) const { 2159 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2160 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2161 2162 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2163 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2164 2165 // With ieee_mode disabled, the instructions have the correct behavior 2166 // already for G_FMINNUM/G_FMAXNUM 2167 if (!MFI->getMode().IEEE) 2168 return !IsIEEEOp; 2169 2170 if (IsIEEEOp) 2171 return true; 2172 2173 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2174 } 2175 2176 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2177 MachineInstr &MI, MachineRegisterInfo &MRI, 2178 MachineIRBuilder &B) const { 2179 // TODO: Should move some of this into LegalizerHelper. 2180 2181 // TODO: Promote dynamic indexing of s16 to s32 2182 2183 // FIXME: Artifact combiner probably should have replaced the truncated 2184 // constant before this, so we shouldn't need 2185 // getConstantVRegValWithLookThrough. 2186 Optional<ValueAndVReg> MaybeIdxVal = 2187 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2188 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2189 return true; 2190 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2191 2192 Register Dst = MI.getOperand(0).getReg(); 2193 Register Vec = MI.getOperand(1).getReg(); 2194 2195 LLT VecTy = MRI.getType(Vec); 2196 LLT EltTy = VecTy.getElementType(); 2197 assert(EltTy == MRI.getType(Dst)); 2198 2199 if (IdxVal < VecTy.getNumElements()) 2200 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2201 else 2202 B.buildUndef(Dst); 2203 2204 MI.eraseFromParent(); 2205 return true; 2206 } 2207 2208 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2209 MachineInstr &MI, MachineRegisterInfo &MRI, 2210 MachineIRBuilder &B) const { 2211 // TODO: Should move some of this into LegalizerHelper. 2212 2213 // TODO: Promote dynamic indexing of s16 to s32 2214 2215 // FIXME: Artifact combiner probably should have replaced the truncated 2216 // constant before this, so we shouldn't need 2217 // getConstantVRegValWithLookThrough. 2218 Optional<ValueAndVReg> MaybeIdxVal = 2219 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2220 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2221 return true; 2222 2223 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2224 Register Dst = MI.getOperand(0).getReg(); 2225 Register Vec = MI.getOperand(1).getReg(); 2226 Register Ins = MI.getOperand(2).getReg(); 2227 2228 LLT VecTy = MRI.getType(Vec); 2229 LLT EltTy = VecTy.getElementType(); 2230 assert(EltTy == MRI.getType(Ins)); 2231 2232 if (IdxVal < VecTy.getNumElements()) 2233 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2234 else 2235 B.buildUndef(Dst); 2236 2237 MI.eraseFromParent(); 2238 return true; 2239 } 2240 2241 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2242 MachineInstr &MI, MachineRegisterInfo &MRI, 2243 MachineIRBuilder &B) const { 2244 const LLT V2S16 = LLT::fixed_vector(2, 16); 2245 2246 Register Dst = MI.getOperand(0).getReg(); 2247 Register Src0 = MI.getOperand(1).getReg(); 2248 LLT DstTy = MRI.getType(Dst); 2249 LLT SrcTy = MRI.getType(Src0); 2250 2251 if (SrcTy == V2S16 && DstTy == V2S16 && 2252 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2253 return true; 2254 2255 MachineIRBuilder HelperBuilder(MI); 2256 GISelObserverWrapper DummyObserver; 2257 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2258 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2259 } 2260 2261 bool AMDGPULegalizerInfo::legalizeSinCos( 2262 MachineInstr &MI, MachineRegisterInfo &MRI, 2263 MachineIRBuilder &B) const { 2264 2265 Register DstReg = MI.getOperand(0).getReg(); 2266 Register SrcReg = MI.getOperand(1).getReg(); 2267 LLT Ty = MRI.getType(DstReg); 2268 unsigned Flags = MI.getFlags(); 2269 2270 Register TrigVal; 2271 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2272 if (ST.hasTrigReducedRange()) { 2273 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2274 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2275 .addUse(MulVal.getReg(0)) 2276 .setMIFlags(Flags).getReg(0); 2277 } else 2278 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2279 2280 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2281 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2282 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2283 .addUse(TrigVal) 2284 .setMIFlags(Flags); 2285 MI.eraseFromParent(); 2286 return true; 2287 } 2288 2289 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2290 MachineIRBuilder &B, 2291 const GlobalValue *GV, 2292 int64_t Offset, 2293 unsigned GAFlags) const { 2294 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2295 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2296 // to the following code sequence: 2297 // 2298 // For constant address space: 2299 // s_getpc_b64 s[0:1] 2300 // s_add_u32 s0, s0, $symbol 2301 // s_addc_u32 s1, s1, 0 2302 // 2303 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2304 // a fixup or relocation is emitted to replace $symbol with a literal 2305 // constant, which is a pc-relative offset from the encoding of the $symbol 2306 // operand to the global variable. 
2307 // 2308 // For global address space: 2309 // s_getpc_b64 s[0:1] 2310 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2311 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2312 // 2313 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2314 // fixups or relocations are emitted to replace $symbol@*@lo and 2315 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2316 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2317 // operand to the global variable. 2318 // 2319 // What we want here is an offset from the value returned by s_getpc 2320 // (which is the address of the s_add_u32 instruction) to the global 2321 // variable, but since the encoding of $symbol starts 4 bytes after the start 2322 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2323 // small. This requires us to add 4 to the global variable offset in order to 2324 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2325 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2326 // instruction. 2327 2328 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2329 2330 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2331 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2332 2333 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2334 .addDef(PCReg); 2335 2336 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2337 if (GAFlags == SIInstrInfo::MO_NONE) 2338 MIB.addImm(0); 2339 else 2340 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2341 2342 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2343 2344 if (PtrTy.getSizeInBits() == 32) 2345 B.buildExtract(DstReg, PCReg, 0); 2346 return true; 2347 } 2348 2349 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2350 MachineInstr &MI, MachineRegisterInfo &MRI, 2351 MachineIRBuilder &B) const { 2352 Register DstReg = MI.getOperand(0).getReg(); 2353 LLT Ty = MRI.getType(DstReg); 2354 unsigned AS = Ty.getAddressSpace(); 2355 2356 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2357 MachineFunction &MF = B.getMF(); 2358 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2359 2360 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2361 if (!MFI->isModuleEntryFunction() && 2362 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2363 const Function &Fn = MF.getFunction(); 2364 DiagnosticInfoUnsupported BadLDSDecl( 2365 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2366 DS_Warning); 2367 Fn.getContext().diagnose(BadLDSDecl); 2368 2369 // We currently don't have a way to correctly allocate LDS objects that 2370 // aren't directly associated with a kernel. We do force inlining of 2371 // functions that use local objects. However, if these dead functions are 2372 // not eliminated, we don't want a compile time error. Just emit a warning 2373 // and a trap, since there should be no callable path here. 2374 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2375 B.buildUndef(DstReg); 2376 MI.eraseFromParent(); 2377 return true; 2378 } 2379 2380 // TODO: We could emit code to handle the initialization somewhere. 
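// (Overview of the cases below: either keep the G_GLOBAL_VALUE in place with
//  an ABS32_LO flag, materialize the group-static size for dynamically sized
//  external LDS, or fold the statically assigned LDS offset into a constant.)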
2381 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2382 const SITargetLowering *TLI = ST.getTargetLowering();
2383 if (!TLI->shouldUseLDSConstAddress(GV)) {
2384 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2385 return true; // Leave in place.
2386 }
2387
2388 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2389 Type *Ty = GV->getValueType();
2390 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2391 // zero-sized type in other languages to declare the dynamic shared
2392 // memory whose size is not known at compile time. They will be
2393 // allocated by the runtime and placed directly after the statically
2394 // allocated ones. They all share the same offset.
2395 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2396 // Adjust alignment for that dynamic shared memory array.
2397 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2398 LLT S32 = LLT::scalar(32);
2399 auto Sz =
2400 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2401 B.buildIntToPtr(DstReg, Sz);
2402 MI.eraseFromParent();
2403 return true;
2404 }
2405 }
2406
2407 B.buildConstant(
2408 DstReg,
2409 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2410 MI.eraseFromParent();
2411 return true;
2412 }
2413
2414 const Function &Fn = MF.getFunction();
2415 DiagnosticInfoUnsupported BadInit(
2416 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2417 Fn.getContext().diagnose(BadInit);
2418 return true;
2419 }
2420
2421 const SITargetLowering *TLI = ST.getTargetLowering();
2422
2423 if (TLI->shouldEmitFixup(GV)) {
2424 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2425 MI.eraseFromParent();
2426 return true;
2427 }
2428
2429 if (TLI->shouldEmitPCReloc(GV)) {
2430 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2431 MI.eraseFromParent();
2432 return true;
2433 }
2434
2435 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2436 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2437
2438 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2439 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2440 MachinePointerInfo::getGOT(MF),
2441 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2442 MachineMemOperand::MOInvariant,
2443 LoadTy, Align(8));
2444
2445 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2446
2447 if (Ty.getSizeInBits() == 32) {
2448 // Truncate if this is a 32-bit constant address.
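// That is, load the full 64-bit GOT entry and then G_EXTRACT the low 32 bits
// into the 32-bit pointer result.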
2449 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2450 B.buildExtract(DstReg, Load, 0); 2451 } else 2452 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2453 2454 MI.eraseFromParent(); 2455 return true; 2456 } 2457 2458 static LLT widenToNextPowerOf2(LLT Ty) { 2459 if (Ty.isVector()) 2460 return Ty.changeElementCount( 2461 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2462 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2463 } 2464 2465 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2466 MachineInstr &MI) const { 2467 MachineIRBuilder &B = Helper.MIRBuilder; 2468 MachineRegisterInfo &MRI = *B.getMRI(); 2469 GISelChangeObserver &Observer = Helper.Observer; 2470 2471 Register PtrReg = MI.getOperand(1).getReg(); 2472 LLT PtrTy = MRI.getType(PtrReg); 2473 unsigned AddrSpace = PtrTy.getAddressSpace(); 2474 2475 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2476 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2477 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2478 Observer.changingInstr(MI); 2479 MI.getOperand(1).setReg(Cast.getReg(0)); 2480 Observer.changedInstr(MI); 2481 return true; 2482 } 2483 2484 if (MI.getOpcode() != AMDGPU::G_LOAD) 2485 return false; 2486 2487 Register ValReg = MI.getOperand(0).getReg(); 2488 LLT ValTy = MRI.getType(ValReg); 2489 2490 MachineMemOperand *MMO = *MI.memoperands_begin(); 2491 const unsigned ValSize = ValTy.getSizeInBits(); 2492 const LLT MemTy = MMO->getMemoryType(); 2493 const Align MemAlign = MMO->getAlign(); 2494 const unsigned MemSize = MemTy.getSizeInBits(); 2495 const unsigned AlignInBits = 8 * MemAlign.value(); 2496 2497 // Widen non-power-of-2 loads to the alignment if needed 2498 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2499 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2500 2501 // This was already the correct extending load result type, so just adjust 2502 // the memory type. 2503 if (WideMemSize == ValSize) { 2504 MachineFunction &MF = B.getMF(); 2505 2506 MachineMemOperand *WideMMO = 2507 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2508 Observer.changingInstr(MI); 2509 MI.setMemRefs(MF, {WideMMO}); 2510 Observer.changedInstr(MI); 2511 return true; 2512 } 2513 2514 // Don't bother handling edge case that should probably never be produced. 2515 if (ValSize > WideMemSize) 2516 return false; 2517 2518 LLT WideTy = widenToNextPowerOf2(ValTy); 2519 2520 Register WideLoad; 2521 if (!WideTy.isVector()) { 2522 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2523 B.buildTrunc(ValReg, WideLoad).getReg(0); 2524 } else { 2525 // Extract the subvector. 2526 2527 if (isRegisterType(ValTy)) { 2528 // If this a case where G_EXTRACT is legal, use it. 2529 // (e.g. <3 x s32> -> <4 x s32>) 2530 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2531 B.buildExtract(ValReg, WideLoad, 0); 2532 } else { 2533 // For cases where the widened type isn't a nice register value, unmerge 2534 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2535 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2536 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2537 B.setInsertPt(B.getMBB(), MI.getIterator()); 2538 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2539 } 2540 } 2541 2542 MI.eraseFromParent(); 2543 return true; 2544 } 2545 2546 return false; 2547 } 2548 2549 bool AMDGPULegalizerInfo::legalizeFMad( 2550 MachineInstr &MI, MachineRegisterInfo &MRI, 2551 MachineIRBuilder &B) const { 2552 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2553 assert(Ty.isScalar()); 2554 2555 MachineFunction &MF = B.getMF(); 2556 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2557 2558 // TODO: Always legal with future ftz flag. 2559 // FIXME: Do we need just output? 2560 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2561 return true; 2562 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2563 return true; 2564 2565 MachineIRBuilder HelperBuilder(MI); 2566 GISelObserverWrapper DummyObserver; 2567 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2568 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2569 } 2570 2571 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2572 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2573 Register DstReg = MI.getOperand(0).getReg(); 2574 Register PtrReg = MI.getOperand(1).getReg(); 2575 Register CmpVal = MI.getOperand(2).getReg(); 2576 Register NewVal = MI.getOperand(3).getReg(); 2577 2578 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2579 "this should not have been custom lowered"); 2580 2581 LLT ValTy = MRI.getType(CmpVal); 2582 LLT VecTy = LLT::fixed_vector(2, ValTy); 2583 2584 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2585 2586 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2587 .addDef(DstReg) 2588 .addUse(PtrReg) 2589 .addUse(PackedVal) 2590 .setMemRefs(MI.memoperands()); 2591 2592 MI.eraseFromParent(); 2593 return true; 2594 } 2595 2596 bool AMDGPULegalizerInfo::legalizeFlog( 2597 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2598 Register Dst = MI.getOperand(0).getReg(); 2599 Register Src = MI.getOperand(1).getReg(); 2600 LLT Ty = B.getMRI()->getType(Dst); 2601 unsigned Flags = MI.getFlags(); 2602 2603 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2604 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2605 2606 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2607 MI.eraseFromParent(); 2608 return true; 2609 } 2610 2611 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2612 MachineIRBuilder &B) const { 2613 Register Dst = MI.getOperand(0).getReg(); 2614 Register Src = MI.getOperand(1).getReg(); 2615 unsigned Flags = MI.getFlags(); 2616 LLT Ty = B.getMRI()->getType(Dst); 2617 2618 auto K = B.buildFConstant(Ty, numbers::log2e); 2619 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2620 B.buildFExp2(Dst, Mul, Flags); 2621 MI.eraseFromParent(); 2622 return true; 2623 } 2624 2625 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2626 MachineIRBuilder &B) const { 2627 Register Dst = MI.getOperand(0).getReg(); 2628 Register Src0 = MI.getOperand(1).getReg(); 2629 Register Src1 = MI.getOperand(2).getReg(); 2630 unsigned Flags = MI.getFlags(); 2631 LLT Ty = B.getMRI()->getType(Dst); 2632 const LLT S16 = LLT::scalar(16); 2633 const LLT S32 = LLT::scalar(32); 2634 2635 if (Ty == S32) { 2636 auto Log = B.buildFLog2(S32, Src0, Flags); 2637 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2638 .addUse(Log.getReg(0)) 2639 .addUse(Src1) 2640 .setMIFlags(Flags); 2641 B.buildFExp2(Dst, Mul, Flags); 2642 } else if (Ty == S16) { 2643 // There's no f16 fmul_legacy, so we need to convert for it. 2644 auto Log = B.buildFLog2(S16, Src0, Flags); 2645 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2646 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2647 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2648 .addUse(Ext0.getReg(0)) 2649 .addUse(Ext1.getReg(0)) 2650 .setMIFlags(Flags); 2651 2652 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2653 } else 2654 return false; 2655 2656 MI.eraseFromParent(); 2657 return true; 2658 } 2659 2660 // Find a source register, ignoring any possible source modifiers. 2661 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2662 Register ModSrc = OrigSrc; 2663 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2664 ModSrc = SrcFNeg->getOperand(1).getReg(); 2665 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2666 ModSrc = SrcFAbs->getOperand(1).getReg(); 2667 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2668 ModSrc = SrcFAbs->getOperand(1).getReg(); 2669 return ModSrc; 2670 } 2671 2672 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2673 MachineRegisterInfo &MRI, 2674 MachineIRBuilder &B) const { 2675 2676 const LLT S1 = LLT::scalar(1); 2677 const LLT S64 = LLT::scalar(64); 2678 Register Dst = MI.getOperand(0).getReg(); 2679 Register OrigSrc = MI.getOperand(1).getReg(); 2680 unsigned Flags = MI.getFlags(); 2681 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2682 "this should not have been custom lowered"); 2683 2684 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2685 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2686 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2687 // V_FRACT bug is: 2688 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2689 // 2690 // Convert floor(x) to (x - fract(x)) 2691 2692 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2693 .addUse(OrigSrc) 2694 .setMIFlags(Flags); 2695 2696 // Give source modifier matching some assistance before obscuring a foldable 2697 // pattern. 2698 2699 // TODO: We can avoid the neg on the fract? The input sign to fract 2700 // shouldn't matter? 2701 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2702 2703 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2704 2705 Register Min = MRI.createGenericVirtualRegister(S64); 2706 2707 // We don't need to concern ourselves with the snan handling difference, so 2708 // use the one which will directly select. 
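// That is, G_FMINNUM_IEEE is used when IEEE mode is enabled and plain
// G_FMINNUM otherwise, so the clamp selects directly to the hardware min;
// the NaN input case is handled explicitly below unless no-nans is set.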
2709 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2710 if (MFI->getMode().IEEE) 2711 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2712 else 2713 B.buildFMinNum(Min, Fract, Const, Flags); 2714 2715 Register CorrectedFract = Min; 2716 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2717 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2718 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2719 } 2720 2721 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2722 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2723 2724 MI.eraseFromParent(); 2725 return true; 2726 } 2727 2728 // Turn an illegal packed v2s16 build vector into bit operations. 2729 // TODO: This should probably be a bitcast action in LegalizerHelper. 2730 bool AMDGPULegalizerInfo::legalizeBuildVector( 2731 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2732 Register Dst = MI.getOperand(0).getReg(); 2733 const LLT S32 = LLT::scalar(32); 2734 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2735 2736 Register Src0 = MI.getOperand(1).getReg(); 2737 Register Src1 = MI.getOperand(2).getReg(); 2738 assert(MRI.getType(Src0) == LLT::scalar(16)); 2739 2740 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2741 B.buildBitcast(Dst, Merge); 2742 2743 MI.eraseFromParent(); 2744 return true; 2745 } 2746 2747 // Check that this is a G_XOR x, -1 2748 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2749 if (MI.getOpcode() != TargetOpcode::G_XOR) 2750 return false; 2751 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2752 return ConstVal && *ConstVal == -1; 2753 } 2754 2755 // Return the use branch instruction, otherwise null if the usage is invalid. 2756 static MachineInstr * 2757 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2758 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2759 Register CondDef = MI.getOperand(0).getReg(); 2760 if (!MRI.hasOneNonDBGUse(CondDef)) 2761 return nullptr; 2762 2763 MachineBasicBlock *Parent = MI.getParent(); 2764 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2765 2766 if (isNot(MRI, *UseMI)) { 2767 Register NegatedCond = UseMI->getOperand(0).getReg(); 2768 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2769 return nullptr; 2770 2771 // We're deleting the def of this value, so we need to remove it. 2772 UseMI->eraseFromParent(); 2773 2774 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2775 Negated = true; 2776 } 2777 2778 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2779 return nullptr; 2780 2781 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2782 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2783 if (Next == Parent->end()) { 2784 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2785 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2786 return nullptr; 2787 UncondBrTarget = &*NextMBB; 2788 } else { 2789 if (Next->getOpcode() != AMDGPU::G_BR) 2790 return nullptr; 2791 Br = &*Next; 2792 UncondBrTarget = Br->getOperand(0).getMBB(); 2793 } 2794 2795 return UseMI; 2796 } 2797 2798 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2799 const ArgDescriptor *Arg, 2800 const TargetRegisterClass *ArgRC, 2801 LLT ArgTy) const { 2802 MCRegister SrcReg = Arg->getRegister(); 2803 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2804 assert(DstReg.isVirtual() && "Virtual register expected"); 2805 2806 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2807 ArgTy); 2808 if (Arg->isMasked()) { 2809 // TODO: Should we try to emit this once in the entry block? 2810 const LLT S32 = LLT::scalar(32); 2811 const unsigned Mask = Arg->getMask(); 2812 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2813 2814 Register AndMaskSrc = LiveIn; 2815 2816 if (Shift != 0) { 2817 auto ShiftAmt = B.buildConstant(S32, Shift); 2818 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2819 } 2820 2821 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2822 } else { 2823 B.buildCopy(DstReg, LiveIn); 2824 } 2825 2826 return true; 2827 } 2828 2829 bool AMDGPULegalizerInfo::loadInputValue( 2830 Register DstReg, MachineIRBuilder &B, 2831 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2832 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2833 const ArgDescriptor *Arg; 2834 const TargetRegisterClass *ArgRC; 2835 LLT ArgTy; 2836 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2837 2838 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2839 return false; // TODO: Handle these 2840 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2841 } 2842 2843 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2844 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2845 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2846 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2847 return false; 2848 2849 MI.eraseFromParent(); 2850 return true; 2851 } 2852 2853 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2854 MachineRegisterInfo &MRI, 2855 MachineIRBuilder &B) const { 2856 Register Dst = MI.getOperand(0).getReg(); 2857 LLT DstTy = MRI.getType(Dst); 2858 LLT S16 = LLT::scalar(16); 2859 LLT S32 = LLT::scalar(32); 2860 LLT S64 = LLT::scalar(64); 2861 2862 if (DstTy == S16) 2863 return legalizeFDIV16(MI, MRI, B); 2864 if (DstTy == S32) 2865 return legalizeFDIV32(MI, MRI, B); 2866 if (DstTy == S64) 2867 return legalizeFDIV64(MI, MRI, B); 2868 2869 return false; 2870 } 2871 2872 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2873 Register DstDivReg, 2874 Register DstRemReg, 2875 Register X, 2876 Register Y) const { 2877 const LLT S1 = LLT::scalar(1); 2878 const LLT S32 = LLT::scalar(32); 2879 2880 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2881 // algorithm used here. 2882 2883 // Initial estimate of inv(y). 2884 auto FloatY = B.buildUITOFP(S32, Y); 2885 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2886 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2887 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2888 auto Z = B.buildFPTOUI(S32, ScaledY); 2889 2890 // One round of UNR. 
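// UNR here is one unsigned Newton-Raphson refinement of the fixed-point
// reciprocal estimate: z' = z + umulh(z, (-y) * z), the scaled form of
// z' = z * (2 - y * z); see the reference above for the full derivation.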
2891 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2892 auto NegYZ = B.buildMul(S32, NegY, Z); 2893 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2894 2895 // Quotient/remainder estimate. 2896 auto Q = B.buildUMulH(S32, X, Z); 2897 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2898 2899 // First quotient/remainder refinement. 2900 auto One = B.buildConstant(S32, 1); 2901 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2902 if (DstDivReg) 2903 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2904 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2905 2906 // Second quotient/remainder refinement. 2907 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2908 if (DstDivReg) 2909 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2910 2911 if (DstRemReg) 2912 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2913 } 2914 2915 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2916 // 2917 // Return lo, hi of result 2918 // 2919 // %cvt.lo = G_UITOFP Val.lo 2920 // %cvt.hi = G_UITOFP Val.hi 2921 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2922 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2923 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2924 // %mul2 = G_FMUL %mul1, 2**(-32) 2925 // %trunc = G_INTRINSIC_TRUNC %mul2 2926 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2927 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2928 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2929 Register Val) { 2930 const LLT S32 = LLT::scalar(32); 2931 auto Unmerge = B.buildUnmerge(S32, Val); 2932 2933 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2934 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2935 2936 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2937 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2938 2939 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2940 auto Mul1 = 2941 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2942 2943 // 2**(-32) 2944 auto Mul2 = 2945 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2946 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2947 2948 // -(2**32) 2949 auto Mad2 = B.buildFMAD(S32, Trunc, 2950 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2951 2952 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2953 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2954 2955 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2956 } 2957 2958 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 2959 Register DstDivReg, 2960 Register DstRemReg, 2961 Register Numer, 2962 Register Denom) const { 2963 const LLT S32 = LLT::scalar(32); 2964 const LLT S64 = LLT::scalar(64); 2965 const LLT S1 = LLT::scalar(1); 2966 Register RcpLo, RcpHi; 2967 2968 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2969 2970 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2971 2972 auto Zero64 = B.buildConstant(S64, 0); 2973 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2974 2975 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2976 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2977 2978 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2979 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2980 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2981 2982 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2983 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2984 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2985 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2986 2987 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2988 
auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2989 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2990 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2991 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2992 2993 auto Zero32 = B.buildConstant(S32, 0); 2994 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2995 auto Add2_HiC = 2996 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2997 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2998 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2999 3000 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3001 Register NumerLo = UnmergeNumer.getReg(0); 3002 Register NumerHi = UnmergeNumer.getReg(1); 3003 3004 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3005 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 3006 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3007 Register Mul3_Lo = UnmergeMul3.getReg(0); 3008 Register Mul3_Hi = UnmergeMul3.getReg(1); 3009 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3010 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3011 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3012 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3013 3014 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3015 Register DenomLo = UnmergeDenom.getReg(0); 3016 Register DenomHi = UnmergeDenom.getReg(1); 3017 3018 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3019 auto C1 = B.buildSExt(S32, CmpHi); 3020 3021 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3022 auto C2 = B.buildSExt(S32, CmpLo); 3023 3024 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3025 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3026 3027 // TODO: Here and below portions of the code can be enclosed into if/endif. 3028 // Currently control flow is unconditional and we have 4 selects after 3029 // potential endif to substitute PHIs. 3030 3031 // if C3 != 0 ... 
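// Sketch of the straight-line form used below: each refinement step
// conditionally adds 1 to the quotient estimate and subtracts the denominator
// from the remainder estimate, and the C3/C6 selects at the end stand in for
// the PHIs an explicit if/endif structure would produce.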
3032 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3033 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3034 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3035 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3036 3037 auto One64 = B.buildConstant(S64, 1); 3038 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3039 3040 auto C4 = 3041 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3042 auto C5 = 3043 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3044 auto C6 = B.buildSelect( 3045 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3046 3047 // if (C6 != 0) 3048 auto Add4 = B.buildAdd(S64, Add3, One64); 3049 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3050 3051 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3052 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3053 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3054 3055 // endif C6 3056 // endif C3 3057 3058 if (DstDivReg) { 3059 auto Sel1 = B.buildSelect( 3060 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3061 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3062 Sel1, MulHi3); 3063 } 3064 3065 if (DstRemReg) { 3066 auto Sel2 = B.buildSelect( 3067 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3068 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3069 Sel2, Sub1); 3070 } 3071 } 3072 3073 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3074 MachineRegisterInfo &MRI, 3075 MachineIRBuilder &B) const { 3076 Register DstDivReg, DstRemReg; 3077 switch (MI.getOpcode()) { 3078 default: 3079 llvm_unreachable("Unexpected opcode!"); 3080 case AMDGPU::G_UDIV: { 3081 DstDivReg = MI.getOperand(0).getReg(); 3082 break; 3083 } 3084 case AMDGPU::G_UREM: { 3085 DstRemReg = MI.getOperand(0).getReg(); 3086 break; 3087 } 3088 case AMDGPU::G_UDIVREM: { 3089 DstDivReg = MI.getOperand(0).getReg(); 3090 DstRemReg = MI.getOperand(1).getReg(); 3091 break; 3092 } 3093 } 3094 3095 const LLT S64 = LLT::scalar(64); 3096 const LLT S32 = LLT::scalar(32); 3097 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3098 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3099 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3100 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3101 3102 if (Ty == S32) 3103 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3104 else if (Ty == S64) 3105 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3106 else 3107 return false; 3108 3109 MI.eraseFromParent(); 3110 return true; 3111 } 3112 3113 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3114 MachineRegisterInfo &MRI, 3115 MachineIRBuilder &B) const { 3116 const LLT S64 = LLT::scalar(64); 3117 const LLT S32 = LLT::scalar(32); 3118 3119 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3120 if (Ty != S32 && Ty != S64) 3121 return false; 3122 3123 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3124 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3125 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3126 3127 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3128 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3129 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3130 3131 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3132 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3133 3134 LHS = 
B.buildXor(Ty, LHS, LHSign).getReg(0); 3135 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3136 3137 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3138 switch (MI.getOpcode()) { 3139 default: 3140 llvm_unreachable("Unexpected opcode!"); 3141 case AMDGPU::G_SDIV: { 3142 DstDivReg = MI.getOperand(0).getReg(); 3143 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3144 break; 3145 } 3146 case AMDGPU::G_SREM: { 3147 DstRemReg = MI.getOperand(0).getReg(); 3148 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3149 break; 3150 } 3151 case AMDGPU::G_SDIVREM: { 3152 DstDivReg = MI.getOperand(0).getReg(); 3153 DstRemReg = MI.getOperand(1).getReg(); 3154 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3155 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3156 break; 3157 } 3158 } 3159 3160 if (Ty == S32) 3161 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3162 else 3163 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3164 3165 if (DstDivReg) { 3166 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3167 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3168 B.buildSub(DstDivReg, SignXor, Sign); 3169 } 3170 3171 if (DstRemReg) { 3172 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3173 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3174 B.buildSub(DstRemReg, SignXor, Sign); 3175 } 3176 3177 MI.eraseFromParent(); 3178 return true; 3179 } 3180 3181 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3182 MachineRegisterInfo &MRI, 3183 MachineIRBuilder &B) const { 3184 Register Res = MI.getOperand(0).getReg(); 3185 Register LHS = MI.getOperand(1).getReg(); 3186 Register RHS = MI.getOperand(2).getReg(); 3187 uint16_t Flags = MI.getFlags(); 3188 LLT ResTy = MRI.getType(Res); 3189 3190 const MachineFunction &MF = B.getMF(); 3191 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3192 MI.getFlag(MachineInstr::FmAfn); 3193 3194 if (!AllowInaccurateRcp) 3195 return false; 3196 3197 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3198 // 1 / x -> RCP(x) 3199 if (CLHS->isExactlyValue(1.0)) { 3200 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3201 .addUse(RHS) 3202 .setMIFlags(Flags); 3203 3204 MI.eraseFromParent(); 3205 return true; 3206 } 3207 3208 // -1 / x -> RCP( FNEG(x) ) 3209 if (CLHS->isExactlyValue(-1.0)) { 3210 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3211 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3212 .addUse(FNeg.getReg(0)) 3213 .setMIFlags(Flags); 3214 3215 MI.eraseFromParent(); 3216 return true; 3217 } 3218 } 3219 3220 // x / y -> x * (1.0 / y) 3221 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3222 .addUse(RHS) 3223 .setMIFlags(Flags); 3224 B.buildFMul(Res, LHS, RCP, Flags); 3225 3226 MI.eraseFromParent(); 3227 return true; 3228 } 3229 3230 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3231 MachineRegisterInfo &MRI, 3232 MachineIRBuilder &B) const { 3233 Register Res = MI.getOperand(0).getReg(); 3234 Register X = MI.getOperand(1).getReg(); 3235 Register Y = MI.getOperand(2).getReg(); 3236 uint16_t Flags = MI.getFlags(); 3237 LLT ResTy = MRI.getType(Res); 3238 3239 const MachineFunction &MF = B.getMF(); 3240 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3241 MI.getFlag(MachineInstr::FmAfn); 3242 3243 if (!AllowInaccurateRcp) 3244 return false; 3245 3246 auto NegY = B.buildFNeg(ResTy, Y); 3247 auto One = B.buildFConstant(ResTy, 1.0); 3248 3249 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, 
false) 3250 .addUse(Y) 3251 .setMIFlags(Flags); 3252 3253 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3254 R = B.buildFMA(ResTy, Tmp0, R, R); 3255 3256 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3257 R = B.buildFMA(ResTy, Tmp1, R, R); 3258 3259 auto Ret = B.buildFMul(ResTy, X, R); 3260 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3261 3262 B.buildFMA(Res, Tmp2, R, Ret); 3263 MI.eraseFromParent(); 3264 return true; 3265 } 3266 3267 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3268 MachineRegisterInfo &MRI, 3269 MachineIRBuilder &B) const { 3270 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3271 return true; 3272 3273 Register Res = MI.getOperand(0).getReg(); 3274 Register LHS = MI.getOperand(1).getReg(); 3275 Register RHS = MI.getOperand(2).getReg(); 3276 3277 uint16_t Flags = MI.getFlags(); 3278 3279 LLT S16 = LLT::scalar(16); 3280 LLT S32 = LLT::scalar(32); 3281 3282 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3283 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3284 3285 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3286 .addUse(RHSExt.getReg(0)) 3287 .setMIFlags(Flags); 3288 3289 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3290 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3291 3292 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3293 .addUse(RDst.getReg(0)) 3294 .addUse(RHS) 3295 .addUse(LHS) 3296 .setMIFlags(Flags); 3297 3298 MI.eraseFromParent(); 3299 return true; 3300 } 3301 3302 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3303 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3304 static void toggleSPDenormMode(bool Enable, 3305 MachineIRBuilder &B, 3306 const GCNSubtarget &ST, 3307 AMDGPU::SIModeRegisterDefaults Mode) { 3308 // Set SP denorm mode to this value. 3309 unsigned SPDenormMode = 3310 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3311 3312 if (ST.hasDenormModeInst()) { 3313 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3314 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3315 3316 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3317 B.buildInstr(AMDGPU::S_DENORM_MODE) 3318 .addImm(NewDenormModeValue); 3319 3320 } else { 3321 // Select FP32 bit field in mode register. 
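// The immediate built below is a hwreg descriptor: the MODE register id in
// the low bits, an offset of 4 (the FP32 denorm-mode bits) in the OFFSET
// field, and a width-minus-one of 1, i.e. a 2-bit field.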
3322 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3323 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3324 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3325 3326 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3327 .addImm(SPDenormMode) 3328 .addImm(SPDenormModeBitField); 3329 } 3330 } 3331 3332 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3333 MachineRegisterInfo &MRI, 3334 MachineIRBuilder &B) const { 3335 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3336 return true; 3337 3338 Register Res = MI.getOperand(0).getReg(); 3339 Register LHS = MI.getOperand(1).getReg(); 3340 Register RHS = MI.getOperand(2).getReg(); 3341 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3342 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3343 3344 uint16_t Flags = MI.getFlags(); 3345 3346 LLT S32 = LLT::scalar(32); 3347 LLT S1 = LLT::scalar(1); 3348 3349 auto One = B.buildFConstant(S32, 1.0f); 3350 3351 auto DenominatorScaled = 3352 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3353 .addUse(LHS) 3354 .addUse(RHS) 3355 .addImm(0) 3356 .setMIFlags(Flags); 3357 auto NumeratorScaled = 3358 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3359 .addUse(LHS) 3360 .addUse(RHS) 3361 .addImm(1) 3362 .setMIFlags(Flags); 3363 3364 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3365 .addUse(DenominatorScaled.getReg(0)) 3366 .setMIFlags(Flags); 3367 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3368 3369 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3370 // aren't modeled as reading it. 3371 if (!Mode.allFP32Denormals()) 3372 toggleSPDenormMode(true, B, ST, Mode); 3373 3374 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3375 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3376 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3377 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3378 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3379 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3380 3381 if (!Mode.allFP32Denormals()) 3382 toggleSPDenormMode(false, B, ST, Mode); 3383 3384 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3385 .addUse(Fma4.getReg(0)) 3386 .addUse(Fma1.getReg(0)) 3387 .addUse(Fma3.getReg(0)) 3388 .addUse(NumeratorScaled.getReg(1)) 3389 .setMIFlags(Flags); 3390 3391 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3392 .addUse(Fmas.getReg(0)) 3393 .addUse(RHS) 3394 .addUse(LHS) 3395 .setMIFlags(Flags); 3396 3397 MI.eraseFromParent(); 3398 return true; 3399 } 3400 3401 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3402 MachineRegisterInfo &MRI, 3403 MachineIRBuilder &B) const { 3404 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3405 return true; 3406 3407 Register Res = MI.getOperand(0).getReg(); 3408 Register LHS = MI.getOperand(1).getReg(); 3409 Register RHS = MI.getOperand(2).getReg(); 3410 3411 uint16_t Flags = MI.getFlags(); 3412 3413 LLT S64 = LLT::scalar(64); 3414 LLT S1 = LLT::scalar(1); 3415 3416 auto One = B.buildFConstant(S64, 1.0); 3417 3418 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3419 .addUse(LHS) 3420 .addUse(RHS) 3421 .addImm(0) 3422 .setMIFlags(Flags); 3423 3424 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3425 3426 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3427 .addUse(DivScale0.getReg(0)) 3428 .setMIFlags(Flags); 3429 3430 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3431 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3432 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3433 3434 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3435 .addUse(LHS) 3436 .addUse(RHS) 3437 .addImm(1) 3438 .setMIFlags(Flags); 3439 3440 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3441 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3442 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3443 3444 Register Scale; 3445 if (!ST.hasUsableDivScaleConditionOutput()) { 3446 // Workaround a hardware bug on SI where the condition output from div_scale 3447 // is not usable. 3448 3449 LLT S32 = LLT::scalar(32); 3450 3451 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3452 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3453 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3454 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3455 3456 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3457 Scale1Unmerge.getReg(1)); 3458 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3459 Scale0Unmerge.getReg(1)); 3460 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3461 } else { 3462 Scale = DivScale1.getReg(1); 3463 } 3464 3465 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3466 .addUse(Fma4.getReg(0)) 3467 .addUse(Fma3.getReg(0)) 3468 .addUse(Mul.getReg(0)) 3469 .addUse(Scale) 3470 .setMIFlags(Flags); 3471 3472 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3473 .addUse(Fmas.getReg(0)) 3474 .addUse(RHS) 3475 .addUse(LHS) 3476 .setMIFlags(Flags); 3477 3478 MI.eraseFromParent(); 3479 return true; 3480 } 3481 3482 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3483 MachineRegisterInfo &MRI, 3484 MachineIRBuilder &B) const { 3485 Register Res = MI.getOperand(0).getReg(); 3486 Register LHS = MI.getOperand(2).getReg(); 3487 Register RHS = MI.getOperand(3).getReg(); 3488 uint16_t Flags = MI.getFlags(); 3489 3490 LLT S32 = LLT::scalar(32); 3491 LLT S1 = LLT::scalar(1); 3492 3493 auto Abs = B.buildFAbs(S32, RHS, Flags); 3494 const APFloat C0Val(1.0f); 3495 3496 auto C0 = B.buildConstant(S32, 0x6f800000); 3497 auto C1 = B.buildConstant(S32, 0x2f800000); 3498 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3499 3500 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3501 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3502 3503 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3504 3505 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3506 .addUse(Mul0.getReg(0)) 3507 .setMIFlags(Flags); 3508 3509 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3510 3511 B.buildFMul(Res, Sel, Mul1, Flags); 3512 3513 MI.eraseFromParent(); 3514 return true; 3515 } 3516 3517 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3518 // FIXME: Why do we handle this one but not other removed instructions? 3519 // 3520 // Reciprocal square root. The clamp prevents infinite results, clamping 3521 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3522 // +-max_float. 
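// A rough sketch of the expansion emitted below (in IEEE mode, where the
// *IEEE min/max variants are used so they select directly):
//   %rsq    = int_amdgcn_rsq(%src)
//   %clamp  = fminnum_ieee(%rsq, +max_float)
//   %result = fmaxnum_ieee(%clamp, -max_float)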
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
                 .addUse(Src)
                 .setMIFlags(Flags);

  // We don't need to concern ourselves with the snan handling difference,
  // since the rsq result is already quieted (or not), so use whichever min/max
  // variant will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}

static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
  switch (IID) {
  case Intrinsic::amdgcn_ds_fadd:
    return AMDGPU::G_ATOMICRMW_FADD;
  case Intrinsic::amdgcn_ds_fmin:
    return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
  case Intrinsic::amdgcn_ds_fmax:
    return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
  default:
    llvm_unreachable("not a DS FP intrinsic");
  }
}

bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
                                                      MachineInstr &MI,
                                                      Intrinsic::ID IID) const {
  GISelChangeObserver &Observer = Helper.Observer;
  Observer.changingInstr(MI);

  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));

  // The remaining operands were used to set fields in the MemOperand on
  // construction.
  for (int I = 6; I > 3; --I)
    MI.RemoveOperand(I);

  MI.RemoveOperand(1); // Remove the intrinsic ID.
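  // At this point the instruction should look roughly like a generic
  // atomicrmw, e.g. for ds_fadd:
  //   %oldval = G_ATOMICRMW_FADD %ptr, %val
  // with the ordering/scope/volatile arguments already folded into the
  // memory operand.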
3593 Observer.changedInstr(MI); 3594 return true; 3595 } 3596 3597 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3598 MachineRegisterInfo &MRI, 3599 MachineIRBuilder &B) const { 3600 uint64_t Offset = 3601 ST.getTargetLowering()->getImplicitParameterOffset( 3602 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3603 LLT DstTy = MRI.getType(DstReg); 3604 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3605 3606 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3607 if (!loadInputValue(KernargPtrReg, B, 3608 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3609 return false; 3610 3611 // FIXME: This should be nuw 3612 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3613 return true; 3614 } 3615 3616 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3617 MachineRegisterInfo &MRI, 3618 MachineIRBuilder &B) const { 3619 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3620 if (!MFI->isEntryFunction()) { 3621 return legalizePreloadedArgIntrin(MI, MRI, B, 3622 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3623 } 3624 3625 Register DstReg = MI.getOperand(0).getReg(); 3626 if (!getImplicitArgPtr(DstReg, MRI, B)) 3627 return false; 3628 3629 MI.eraseFromParent(); 3630 return true; 3631 } 3632 3633 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3634 MachineRegisterInfo &MRI, 3635 MachineIRBuilder &B, 3636 unsigned AddrSpace) const { 3637 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3638 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3639 Register Hi32 = Unmerge.getReg(1); 3640 3641 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3642 MI.eraseFromParent(); 3643 return true; 3644 } 3645 3646 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3647 // offset (the offset that is included in bounds checking and swizzling, to be 3648 // split between the instruction's voffset and immoffset fields) and soffset 3649 // (the offset that is excluded from bounds checking and swizzling, to go in 3650 // the instruction's soffset field). This function takes the first kind of 3651 // offset and figures out how to split it between voffset and immoffset. 3652 std::pair<Register, unsigned> 3653 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3654 Register OrigOffset) const { 3655 const unsigned MaxImm = 4095; 3656 Register BaseReg; 3657 unsigned ImmOffset; 3658 const LLT S32 = LLT::scalar(32); 3659 MachineRegisterInfo &MRI = *B.getMRI(); 3660 3661 std::tie(BaseReg, ImmOffset) = 3662 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3663 3664 // If BaseReg is a pointer, convert it to int. 3665 if (MRI.getType(BaseReg).isPointer()) 3666 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3667 3668 // If the immediate value is too big for the immoffset field, put the value 3669 // and -4096 into the immoffset field so that the value that is copied/added 3670 // for the voffset field is a multiple of 4096, and it stands more chance 3671 // of being CSEd with the copy/add for another similar load/store. 3672 // However, do not do that rounding down to a multiple of 4096 if that is a 3673 // negative number, as it appears to be illegal to have a negative offset 3674 // in the vgpr, even if adding the immediate offset makes it positive. 
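  // As a worked example (a sketch, assuming the base/offset split above
  // succeeded): a constant offset of 8196 becomes Overflow = 8192, which is
  // added into the voffset register below, and ImmOffset = 4; a negative
  // offset such as -16 ends up entirely in the voffset add with ImmOffset = 0.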
3675 unsigned Overflow = ImmOffset & ~MaxImm; 3676 ImmOffset -= Overflow; 3677 if ((int32_t)Overflow < 0) { 3678 Overflow += ImmOffset; 3679 ImmOffset = 0; 3680 } 3681 3682 if (Overflow != 0) { 3683 if (!BaseReg) { 3684 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3685 } else { 3686 auto OverflowVal = B.buildConstant(S32, Overflow); 3687 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3688 } 3689 } 3690 3691 if (!BaseReg) 3692 BaseReg = B.buildConstant(S32, 0).getReg(0); 3693 3694 return std::make_pair(BaseReg, ImmOffset); 3695 } 3696 3697 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3698 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3699 Register VOffset, Register SOffset, 3700 unsigned ImmOffset, Register VIndex, 3701 MachineRegisterInfo &MRI) const { 3702 Optional<ValueAndVReg> MaybeVOffsetVal = 3703 getConstantVRegValWithLookThrough(VOffset, MRI); 3704 Optional<ValueAndVReg> MaybeSOffsetVal = 3705 getConstantVRegValWithLookThrough(SOffset, MRI); 3706 Optional<ValueAndVReg> MaybeVIndexVal = 3707 getConstantVRegValWithLookThrough(VIndex, MRI); 3708 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3709 // update the MMO with that offset. The stride is unknown so we can only do 3710 // this if VIndex is constant 0. 3711 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3712 MaybeVIndexVal->Value == 0) { 3713 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3714 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3715 MMO->setOffset(TotalOffset); 3716 } else { 3717 // We don't have a constant combined offset to use in the MMO. Give up. 3718 MMO->setValue((Value *)nullptr); 3719 } 3720 } 3721 3722 /// Handle register layout difference for f16 images for some subtargets. 
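/// For example, on subtargets with unpacked d16 memory instructions a
/// <4 x s16> store source is widened so that each element lives in the low
/// half of its own 32-bit register (<4 x s32>), while packed subtargets keep
/// two 16-bit elements per dword.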
3723 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3724 MachineRegisterInfo &MRI, 3725 Register Reg, 3726 bool ImageStore) const { 3727 const LLT S16 = LLT::scalar(16); 3728 const LLT S32 = LLT::scalar(32); 3729 LLT StoreVT = MRI.getType(Reg); 3730 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3731 3732 if (ST.hasUnpackedD16VMem()) { 3733 auto Unmerge = B.buildUnmerge(S16, Reg); 3734 3735 SmallVector<Register, 4> WideRegs; 3736 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3737 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3738 3739 int NumElts = StoreVT.getNumElements(); 3740 3741 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3742 .getReg(0); 3743 } 3744 3745 if (ImageStore && ST.hasImageStoreD16Bug()) { 3746 if (StoreVT.getNumElements() == 2) { 3747 SmallVector<Register, 4> PackedRegs; 3748 Reg = B.buildBitcast(S32, Reg).getReg(0); 3749 PackedRegs.push_back(Reg); 3750 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3751 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3752 .getReg(0); 3753 } 3754 3755 if (StoreVT.getNumElements() == 3) { 3756 SmallVector<Register, 4> PackedRegs; 3757 auto Unmerge = B.buildUnmerge(S16, Reg); 3758 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3759 PackedRegs.push_back(Unmerge.getReg(I)); 3760 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3761 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3762 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3763 } 3764 3765 if (StoreVT.getNumElements() == 4) { 3766 SmallVector<Register, 4> PackedRegs; 3767 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3768 auto Unmerge = B.buildUnmerge(S32, Reg); 3769 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3770 PackedRegs.push_back(Unmerge.getReg(I)); 3771 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3772 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3773 .getReg(0); 3774 } 3775 3776 llvm_unreachable("invalid data type"); 3777 } 3778 3779 return Reg; 3780 } 3781 3782 Register AMDGPULegalizerInfo::fixStoreSourceType( 3783 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3784 MachineRegisterInfo *MRI = B.getMRI(); 3785 LLT Ty = MRI->getType(VData); 3786 3787 const LLT S16 = LLT::scalar(16); 3788 3789 // Fixup illegal register types for i8 stores. 3790 if (Ty == LLT::scalar(8) || Ty == S16) { 3791 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3792 return AnyExt; 3793 } 3794 3795 if (Ty.isVector()) { 3796 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3797 if (IsFormat) 3798 return handleD16VData(B, *MRI, VData); 3799 } 3800 } 3801 3802 return VData; 3803 } 3804 3805 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3806 MachineRegisterInfo &MRI, 3807 MachineIRBuilder &B, 3808 bool IsTyped, 3809 bool IsFormat) const { 3810 Register VData = MI.getOperand(1).getReg(); 3811 LLT Ty = MRI.getType(VData); 3812 LLT EltTy = Ty.getScalarType(); 3813 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3814 const LLT S32 = LLT::scalar(32); 3815 3816 VData = fixStoreSourceType(B, VData, IsFormat); 3817 Register RSrc = MI.getOperand(2).getReg(); 3818 3819 MachineMemOperand *MMO = *MI.memoperands_begin(); 3820 const int MemSize = MMO->getSize(); 3821 3822 unsigned ImmOffset; 3823 3824 // The typed intrinsics add an immediate after the registers. 
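  // Rough sketch of the operand layout assumed here, for the raw form:
  //   (intrinsic id, vdata, rsrc, voffset, soffset, aux)  -- 6 operands;
  // the struct form inserts a vindex after rsrc (7 operands) and the typed
  // forms add a format immediate, which is what the counting below checks.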
3825 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3826 3827 // The struct intrinsic variants add one additional operand over raw. 3828 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3829 Register VIndex; 3830 int OpOffset = 0; 3831 if (HasVIndex) { 3832 VIndex = MI.getOperand(3).getReg(); 3833 OpOffset = 1; 3834 } else { 3835 VIndex = B.buildConstant(S32, 0).getReg(0); 3836 } 3837 3838 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3839 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3840 3841 unsigned Format = 0; 3842 if (IsTyped) { 3843 Format = MI.getOperand(5 + OpOffset).getImm(); 3844 ++OpOffset; 3845 } 3846 3847 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3848 3849 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3850 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3851 3852 unsigned Opc; 3853 if (IsTyped) { 3854 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3855 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3856 } else if (IsFormat) { 3857 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3858 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3859 } else { 3860 switch (MemSize) { 3861 case 1: 3862 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3863 break; 3864 case 2: 3865 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3866 break; 3867 default: 3868 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3869 break; 3870 } 3871 } 3872 3873 auto MIB = B.buildInstr(Opc) 3874 .addUse(VData) // vdata 3875 .addUse(RSrc) // rsrc 3876 .addUse(VIndex) // vindex 3877 .addUse(VOffset) // voffset 3878 .addUse(SOffset) // soffset 3879 .addImm(ImmOffset); // offset(imm) 3880 3881 if (IsTyped) 3882 MIB.addImm(Format); 3883 3884 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3885 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3886 .addMemOperand(MMO); 3887 3888 MI.eraseFromParent(); 3889 return true; 3890 } 3891 3892 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3893 MachineRegisterInfo &MRI, 3894 MachineIRBuilder &B, 3895 bool IsFormat, 3896 bool IsTyped) const { 3897 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3898 MachineMemOperand *MMO = *MI.memoperands_begin(); 3899 const LLT MemTy = MMO->getMemoryType(); 3900 const LLT S32 = LLT::scalar(32); 3901 3902 Register Dst = MI.getOperand(0).getReg(); 3903 Register RSrc = MI.getOperand(2).getReg(); 3904 3905 // The typed intrinsics add an immediate after the registers. 3906 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3907 3908 // The struct intrinsic variants add one additional operand over raw. 
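  // Sketch of the layout assumed for the raw load form:
  //   (vdata def, intrinsic id, rsrc, voffset, soffset, aux)  -- 6 operands;
  // the struct form adds a vindex after rsrc and the typed forms add a
  // format immediate, so the operand count below identifies the variant.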
3909 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3910 Register VIndex; 3911 int OpOffset = 0; 3912 if (HasVIndex) { 3913 VIndex = MI.getOperand(3).getReg(); 3914 OpOffset = 1; 3915 } else { 3916 VIndex = B.buildConstant(S32, 0).getReg(0); 3917 } 3918 3919 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3920 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3921 3922 unsigned Format = 0; 3923 if (IsTyped) { 3924 Format = MI.getOperand(5 + OpOffset).getImm(); 3925 ++OpOffset; 3926 } 3927 3928 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3929 unsigned ImmOffset; 3930 3931 LLT Ty = MRI.getType(Dst); 3932 LLT EltTy = Ty.getScalarType(); 3933 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3934 const bool Unpacked = ST.hasUnpackedD16VMem(); 3935 3936 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3937 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3938 3939 unsigned Opc; 3940 3941 if (IsTyped) { 3942 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3943 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3944 } else if (IsFormat) { 3945 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3946 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3947 } else { 3948 switch (MemTy.getSizeInBits()) { 3949 case 8: 3950 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3951 break; 3952 case 16: 3953 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3954 break; 3955 default: 3956 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3957 break; 3958 } 3959 } 3960 3961 Register LoadDstReg; 3962 3963 bool IsExtLoad = 3964 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 3965 LLT UnpackedTy = Ty.changeElementSize(32); 3966 3967 if (IsExtLoad) 3968 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3969 else if (Unpacked && IsD16 && Ty.isVector()) 3970 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3971 else 3972 LoadDstReg = Dst; 3973 3974 auto MIB = B.buildInstr(Opc) 3975 .addDef(LoadDstReg) // vdata 3976 .addUse(RSrc) // rsrc 3977 .addUse(VIndex) // vindex 3978 .addUse(VOffset) // voffset 3979 .addUse(SOffset) // soffset 3980 .addImm(ImmOffset); // offset(imm) 3981 3982 if (IsTyped) 3983 MIB.addImm(Format); 3984 3985 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3986 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3987 .addMemOperand(MMO); 3988 3989 if (LoadDstReg != Dst) { 3990 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3991 3992 // Widen result for extending loads was widened. 3993 if (IsExtLoad) 3994 B.buildTrunc(Dst, LoadDstReg); 3995 else { 3996 // Repack to original 16-bit vector result 3997 // FIXME: G_TRUNC should work, but legalization currently fails 3998 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3999 SmallVector<Register, 4> Repack; 4000 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4001 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4002 B.buildMerge(Dst, Repack); 4003 } 4004 } 4005 4006 MI.eraseFromParent(); 4007 return true; 4008 } 4009 4010 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4011 MachineIRBuilder &B, 4012 bool IsInc) const { 4013 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4014 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4015 B.buildInstr(Opc) 4016 .addDef(MI.getOperand(0).getReg()) 4017 .addUse(MI.getOperand(2).getReg()) 4018 .addUse(MI.getOperand(3).getReg()) 4019 .cloneMemRefs(MI); 4020 MI.eraseFromParent(); 4021 return true; 4022 } 4023 4024 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4025 switch (IntrID) { 4026 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4027 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4028 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4029 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4030 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4031 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4032 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4033 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4034 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4035 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4036 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4037 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4038 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4039 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4040 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4041 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4042 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4043 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4044 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4045 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4046 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4047 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4048 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4049 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4050 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4051 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4052 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4053 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4054 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4055 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4056 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4057 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4058 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4059 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4060 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4061 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4062 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4063 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4064 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4065 case Intrinsic::amdgcn_buffer_atomic_fadd: 4066 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4067 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4068 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4069 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4070 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4071 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4072 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4073 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4074 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4075 default: 4076 llvm_unreachable("unhandled atomic opcode"); 4077 } 4078 } 4079 4080 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4081 MachineIRBuilder &B, 4082 Intrinsic::ID IID) const { 4083 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4084 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4085 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4086 4087 Register Dst; 4088 4089 int OpOffset = 0; 4090 if (HasReturn) { 4091 // A few FP atomics do not support return values. 
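    // (For example, the buffer fadd atomics on some subtargets are only
    // available in a no-return form; there is then no def and the operand
    // indices below shift down by one.)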
4092 Dst = MI.getOperand(0).getReg(); 4093 } else { 4094 OpOffset = -1; 4095 } 4096 4097 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4098 Register CmpVal; 4099 4100 if (IsCmpSwap) { 4101 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4102 ++OpOffset; 4103 } 4104 4105 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4106 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4107 4108 // The struct intrinsic variants add one additional operand over raw. 4109 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4110 Register VIndex; 4111 if (HasVIndex) { 4112 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4113 ++OpOffset; 4114 } else { 4115 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4116 } 4117 4118 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4119 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4120 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4121 4122 MachineMemOperand *MMO = *MI.memoperands_begin(); 4123 4124 unsigned ImmOffset; 4125 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4126 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4127 4128 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4129 4130 if (HasReturn) 4131 MIB.addDef(Dst); 4132 4133 MIB.addUse(VData); // vdata 4134 4135 if (IsCmpSwap) 4136 MIB.addReg(CmpVal); 4137 4138 MIB.addUse(RSrc) // rsrc 4139 .addUse(VIndex) // vindex 4140 .addUse(VOffset) // voffset 4141 .addUse(SOffset) // soffset 4142 .addImm(ImmOffset) // offset(imm) 4143 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4144 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4145 .addMemOperand(MMO); 4146 4147 MI.eraseFromParent(); 4148 return true; 4149 } 4150 4151 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4152 /// vector with s16 typed elements. 4153 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4154 SmallVectorImpl<Register> &PackedAddrs, 4155 unsigned ArgOffset, 4156 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4157 bool IsA16, bool IsG16) { 4158 const LLT S16 = LLT::scalar(16); 4159 const LLT V2S16 = LLT::fixed_vector(2, 16); 4160 auto EndIdx = Intr->VAddrEnd; 4161 4162 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4163 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4164 if (!SrcOp.isReg()) 4165 continue; // _L to _LZ may have eliminated this. 4166 4167 Register AddrReg = SrcOp.getReg(); 4168 4169 if ((I < Intr->GradientStart) || 4170 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4171 (I >= Intr->CoordStart && !IsA16)) { 4172 // Handle any gradient or coordinate operands that should not be packed 4173 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4174 PackedAddrs.push_back(AddrReg); 4175 } else { 4176 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4177 // derivatives dx/dh and dx/dv are packed with undef. 
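      // For example, a 2D sample with A16 packs its (u, v) coordinates into a
      // single <2 x s16> register, while an odd trailing component (or the
      // lone 1D derivative) gets an undef high lane instead.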
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Above 8 elements round up to next power of 2 (i.e. 16).
    if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
/// now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
4261 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4262 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4263 4264 MachineRegisterInfo *MRI = B.getMRI(); 4265 const LLT S32 = LLT::scalar(32); 4266 const LLT S16 = LLT::scalar(16); 4267 const LLT V2S16 = LLT::fixed_vector(2, 16); 4268 4269 unsigned DMask = 0; 4270 4271 // Check for 16 bit addresses and pack if true. 4272 LLT GradTy = 4273 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4274 LLT AddrTy = 4275 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4276 const bool IsG16 = GradTy == S16; 4277 const bool IsA16 = AddrTy == S16; 4278 4279 int DMaskLanes = 0; 4280 if (!BaseOpcode->Atomic) { 4281 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4282 if (BaseOpcode->Gather4) { 4283 DMaskLanes = 4; 4284 } else if (DMask != 0) { 4285 DMaskLanes = countPopulation(DMask); 4286 } else if (!IsTFE && !BaseOpcode->Store) { 4287 // If dmask is 0, this is a no-op load. This can be eliminated. 4288 B.buildUndef(MI.getOperand(0)); 4289 MI.eraseFromParent(); 4290 return true; 4291 } 4292 } 4293 4294 Observer.changingInstr(MI); 4295 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4296 4297 unsigned NewOpcode = NumDefs == 0 ? 4298 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4299 4300 // Track that we legalized this 4301 MI.setDesc(B.getTII().get(NewOpcode)); 4302 4303 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4304 // dmask to be at least 1 otherwise the instruction will fail 4305 if (IsTFE && DMask == 0) { 4306 DMask = 0x1; 4307 DMaskLanes = 1; 4308 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4309 } 4310 4311 if (BaseOpcode->Atomic) { 4312 Register VData0 = MI.getOperand(2).getReg(); 4313 LLT Ty = MRI->getType(VData0); 4314 4315 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4316 if (Ty.isVector()) 4317 return false; 4318 4319 if (BaseOpcode->AtomicX2) { 4320 Register VData1 = MI.getOperand(3).getReg(); 4321 // The two values are packed in one register. 4322 LLT PackedTy = LLT::fixed_vector(2, Ty); 4323 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4324 MI.getOperand(2).setReg(Concat.getReg(0)); 4325 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4326 } 4327 } 4328 4329 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4330 4331 // Optimize _L to _LZ when _L is zero 4332 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4333 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4334 const ConstantFP *ConstantLod; 4335 4336 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4337 m_GFCst(ConstantLod))) { 4338 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4339 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4340 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4341 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4342 Intr->Dim); 4343 4344 // The starting indexes should remain in the same place. 
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs())
            .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
        MI.RemoveOperand(ArgOffset + Intr->LodIndex);
        Intr = NewImageDimIntr;
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
    int64_t ConstantLod;
    if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
                 m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of replacing
        // it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // Rewrite the addressing register layout before doing anything else.
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 not supported
    return false;
  }

  if (IsA16 || IsG16) {
    if (Intr->NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;

      packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
                                IsG16);

      // See also below in the non-a16 branch
      const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
                          PackedRegs.size() <= ST.getNSAMaxSize();

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const unsigned NumPacked = PackedRegs.size();
      for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
        MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I - Intr->VAddrStart < NumPacked)
          SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
                        CorrectedNumVAddrs <= ST.getNSAMaxSize();

    if (!UseNSA && Intr->NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
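    // A d16 store source may still need repacking, e.g. a <4 x s16> value is
    // rewritten to <4 x s32> on subtargets with unpacked d16 memory
    // instructions (see handleD16VData above).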
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
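  // As a rough example: a d16 <4 x s16> load with TFE on a packed subtarget
  // loads a <3 x s32> (two data dwords plus the status dword); the two data
  // dwords are split off below, bitcast back to <2 x s16>, and concatenated
  // into the original <4 x s16> result.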
4541 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4542 4543 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4544 4545 if (ResultNumRegs == 1) { 4546 assert(!IsTFE); 4547 ResultRegs[0] = NewResultReg; 4548 } else { 4549 // We have to repack into a new vector of some kind. 4550 for (int I = 0; I != NumDataRegs; ++I) 4551 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4552 B.buildUnmerge(ResultRegs, NewResultReg); 4553 4554 // Drop the final TFE element to get the data part. The TFE result is 4555 // directly written to the right place already. 4556 if (IsTFE) 4557 ResultRegs.resize(NumDataRegs); 4558 } 4559 4560 // For an s16 scalar result, we form an s32 result with a truncate regardless 4561 // of packed vs. unpacked. 4562 if (IsD16 && !Ty.isVector()) { 4563 B.buildTrunc(DstReg, ResultRegs[0]); 4564 return true; 4565 } 4566 4567 // Avoid a build/concat_vector of 1 entry. 4568 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4569 B.buildBitcast(DstReg, ResultRegs[0]); 4570 return true; 4571 } 4572 4573 assert(Ty.isVector()); 4574 4575 if (IsD16) { 4576 // For packed D16 results with TFE enabled, all the data components are 4577 // S32. Cast back to the expected type. 4578 // 4579 // TODO: We don't really need to use load s32 elements. We would only need one 4580 // cast for the TFE result if a multiple of v2s16 was used. 4581 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4582 for (Register &Reg : ResultRegs) 4583 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4584 } else if (ST.hasUnpackedD16VMem()) { 4585 for (Register &Reg : ResultRegs) 4586 Reg = B.buildTrunc(S16, Reg).getReg(0); 4587 } 4588 } 4589 4590 auto padWithUndef = [&](LLT Ty, int NumElts) { 4591 if (NumElts == 0) 4592 return; 4593 Register Undef = B.buildUndef(Ty).getReg(0); 4594 for (int I = 0; I != NumElts; ++I) 4595 ResultRegs.push_back(Undef); 4596 }; 4597 4598 // Pad out any elements eliminated due to the dmask. 4599 LLT ResTy = MRI->getType(ResultRegs[0]); 4600 if (!ResTy.isVector()) { 4601 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4602 B.buildBuildVector(DstReg, ResultRegs); 4603 return true; 4604 } 4605 4606 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4607 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4608 4609 // Deal with the one annoying legal case. 4610 const LLT V3S16 = LLT::fixed_vector(3, 16); 4611 if (Ty == V3S16) { 4612 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4613 auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); 4614 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4615 return true; 4616 } 4617 4618 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4619 B.buildConcatVectors(DstReg, ResultRegs); 4620 return true; 4621 } 4622 4623 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4624 LegalizerHelper &Helper, MachineInstr &MI) const { 4625 MachineIRBuilder &B = Helper.MIRBuilder; 4626 GISelChangeObserver &Observer = Helper.Observer; 4627 4628 Register Dst = MI.getOperand(0).getReg(); 4629 LLT Ty = B.getMRI()->getType(Dst); 4630 unsigned Size = Ty.getSizeInBits(); 4631 MachineFunction &MF = B.getMF(); 4632 4633 Observer.changingInstr(MI); 4634 4635 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4636 Ty = getBitcastRegisterType(Ty); 4637 Helper.bitcastDst(MI, Ty, 0); 4638 Dst = MI.getOperand(0).getReg(); 4639 B.setInsertPt(B.getMBB(), MI); 4640 } 4641 4642 // FIXME: We don't really need this intermediate instruction. 
The intrinsic 4643 // should be fixed to have a memory operand. Since it's readnone, we're not 4644 // allowed to add one. 4645 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4646 MI.RemoveOperand(1); // Remove intrinsic ID 4647 4648 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4649 // TODO: Should this use datalayout alignment? 4650 const unsigned MemSize = (Size + 7) / 8; 4651 const Align MemAlign(4); 4652 MachineMemOperand *MMO = MF.getMachineMemOperand( 4653 MachinePointerInfo(), 4654 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4655 MachineMemOperand::MOInvariant, 4656 MemSize, MemAlign); 4657 MI.addMemOperand(MF, MMO); 4658 4659 // There are no 96-bit result scalar loads, but widening to 128-bit should 4660 // always be legal. We may need to restore this to a 96-bit result if it turns 4661 // out this needs to be converted to a vector load during RegBankSelect. 4662 if (!isPowerOf2_32(Size)) { 4663 if (Ty.isVector()) 4664 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4665 else 4666 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4667 } 4668 4669 Observer.changedInstr(MI); 4670 return true; 4671 } 4672 4673 // TODO: Move to selection 4674 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4675 MachineRegisterInfo &MRI, 4676 MachineIRBuilder &B) const { 4677 if (!ST.isTrapHandlerEnabled() || 4678 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4679 return legalizeTrapEndpgm(MI, MRI, B); 4680 4681 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4682 switch (*HsaAbiVer) { 4683 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4684 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4685 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4686 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4687 return ST.supportsGetDoorbellID() ? 4688 legalizeTrapHsa(MI, MRI, B) : 4689 legalizeTrapHsaQueuePtr(MI, MRI, B); 4690 } 4691 } 4692 4693 llvm_unreachable("Unknown trap handler"); 4694 } 4695 4696 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4697 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4698 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4699 MI.eraseFromParent(); 4700 return true; 4701 } 4702 4703 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4704 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4705 // Pass queue pointer to trap handler as input, and insert trap instruction 4706 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4707 Register LiveIn = 4708 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4709 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4710 return false; 4711 4712 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4713 B.buildCopy(SGPR01, LiveIn); 4714 B.buildInstr(AMDGPU::S_TRAP) 4715 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4716 .addReg(SGPR01, RegState::Implicit); 4717 4718 MI.eraseFromParent(); 4719 return true; 4720 } 4721 4722 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4723 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4724 B.buildInstr(AMDGPU::S_TRAP) 4725 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4726 MI.eraseFromParent(); 4727 return true; 4728 } 4729 4730 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4731 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4732 // Is non-HSA path or trap-handler disabled? 
then, report a warning 4733 // accordingly 4734 if (!ST.isTrapHandlerEnabled() || 4735 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4736 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4737 "debugtrap handler not supported", 4738 MI.getDebugLoc(), DS_Warning); 4739 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4740 Ctx.diagnose(NoTrap); 4741 } else { 4742 // Insert debug-trap instruction 4743 B.buildInstr(AMDGPU::S_TRAP) 4744 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4745 } 4746 4747 MI.eraseFromParent(); 4748 return true; 4749 } 4750 4751 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4752 MachineIRBuilder &B) const { 4753 MachineRegisterInfo &MRI = *B.getMRI(); 4754 const LLT S16 = LLT::scalar(16); 4755 const LLT S32 = LLT::scalar(32); 4756 4757 Register DstReg = MI.getOperand(0).getReg(); 4758 Register NodePtr = MI.getOperand(2).getReg(); 4759 Register RayExtent = MI.getOperand(3).getReg(); 4760 Register RayOrigin = MI.getOperand(4).getReg(); 4761 Register RayDir = MI.getOperand(5).getReg(); 4762 Register RayInvDir = MI.getOperand(6).getReg(); 4763 Register TDescr = MI.getOperand(7).getReg(); 4764 4765 if (!ST.hasGFX10_AEncoding()) { 4766 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4767 "intrinsic not supported on subtarget", 4768 MI.getDebugLoc()); 4769 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4770 return false; 4771 } 4772 4773 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4774 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4775 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4776 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4777 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4778 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4779 4780 SmallVector<Register, 12> Ops; 4781 if (Is64) { 4782 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4783 Ops.push_back(Unmerge.getReg(0)); 4784 Ops.push_back(Unmerge.getReg(1)); 4785 } else { 4786 Ops.push_back(NodePtr); 4787 } 4788 Ops.push_back(RayExtent); 4789 4790 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4791 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4792 Ops.push_back(Unmerge.getReg(0)); 4793 Ops.push_back(Unmerge.getReg(1)); 4794 Ops.push_back(Unmerge.getReg(2)); 4795 }; 4796 4797 packLanes(RayOrigin); 4798 if (IsA16) { 4799 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4800 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4801 Register R1 = MRI.createGenericVirtualRegister(S32); 4802 Register R2 = MRI.createGenericVirtualRegister(S32); 4803 Register R3 = MRI.createGenericVirtualRegister(S32); 4804 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4805 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4806 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4807 Ops.push_back(R1); 4808 Ops.push_back(R2); 4809 Ops.push_back(R3); 4810 } else { 4811 packLanes(RayDir); 4812 packLanes(RayInvDir); 4813 } 4814 4815 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4816 .addDef(DstReg) 4817 .addImm(Opcode); 4818 4819 for (Register R : Ops) { 4820 MIB.addUse(R); 4821 } 4822 4823 MIB.addUse(TDescr) 4824 .addImm(IsA16 ? 
1 : 0) 4825 .cloneMemRefs(MI); 4826 4827 MI.eraseFromParent(); 4828 return true; 4829 } 4830 4831 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4832 MachineInstr &MI) const { 4833 MachineIRBuilder &B = Helper.MIRBuilder; 4834 MachineRegisterInfo &MRI = *B.getMRI(); 4835 4836 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4837 auto IntrID = MI.getIntrinsicID(); 4838 switch (IntrID) { 4839 case Intrinsic::amdgcn_if: 4840 case Intrinsic::amdgcn_else: { 4841 MachineInstr *Br = nullptr; 4842 MachineBasicBlock *UncondBrTarget = nullptr; 4843 bool Negated = false; 4844 if (MachineInstr *BrCond = 4845 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4846 const SIRegisterInfo *TRI 4847 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4848 4849 Register Def = MI.getOperand(1).getReg(); 4850 Register Use = MI.getOperand(3).getReg(); 4851 4852 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4853 4854 if (Negated) 4855 std::swap(CondBrTarget, UncondBrTarget); 4856 4857 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4858 if (IntrID == Intrinsic::amdgcn_if) { 4859 B.buildInstr(AMDGPU::SI_IF) 4860 .addDef(Def) 4861 .addUse(Use) 4862 .addMBB(UncondBrTarget); 4863 } else { 4864 B.buildInstr(AMDGPU::SI_ELSE) 4865 .addDef(Def) 4866 .addUse(Use) 4867 .addMBB(UncondBrTarget); 4868 } 4869 4870 if (Br) { 4871 Br->getOperand(0).setMBB(CondBrTarget); 4872 } else { 4873 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4874 // since we're swapping branch targets it needs to be reinserted. 4875 // FIXME: IRTranslator should probably not do this 4876 B.buildBr(*CondBrTarget); 4877 } 4878 4879 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4880 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4881 MI.eraseFromParent(); 4882 BrCond->eraseFromParent(); 4883 return true; 4884 } 4885 4886 return false; 4887 } 4888 case Intrinsic::amdgcn_loop: { 4889 MachineInstr *Br = nullptr; 4890 MachineBasicBlock *UncondBrTarget = nullptr; 4891 bool Negated = false; 4892 if (MachineInstr *BrCond = 4893 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4894 const SIRegisterInfo *TRI 4895 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4896 4897 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4898 Register Reg = MI.getOperand(2).getReg(); 4899 4900 if (Negated) 4901 std::swap(CondBrTarget, UncondBrTarget); 4902 4903 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4904 B.buildInstr(AMDGPU::SI_LOOP) 4905 .addUse(Reg) 4906 .addMBB(UncondBrTarget); 4907 4908 if (Br) 4909 Br->getOperand(0).setMBB(CondBrTarget); 4910 else 4911 B.buildBr(*CondBrTarget); 4912 4913 MI.eraseFromParent(); 4914 BrCond->eraseFromParent(); 4915 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4916 return true; 4917 } 4918 4919 return false; 4920 } 4921 case Intrinsic::amdgcn_kernarg_segment_ptr: 4922 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4923 // This only makes sense to call in a kernel, so just lower to null. 
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
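  // Image dimension intrinsics are looked up via getImageDimIntrinsicInfo and
  // legalized separately; anything else is treated as already legal.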
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}