//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
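
// Illustrative examples (derived from the helpers above, not exercised here):
// getPow2VectorType(<3 x s16>) rounds the element count up to the next power
// of two, giving <4 x s16>, and getPow2ScalarType(s48) rounds the bit width
// up to s64.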

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}
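
// For example, per the predicates above: a naturally aligned <4 x s32> global
// load passes all three checks, while an s96 load is rejected by
// loadStoreBitcastWorkaround() (unless -amdgpu-global-isel-new-legality is
// set) and is instead handled through the bitcast actions registered below.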

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case when the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .minScalar(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);
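
    // For illustration: on this path (16-bit instructions without the full
    // gfx9 feature set), a G_ADD on s8 is widened to s16 by minScalar, and a
    // G_ADD on <2 x s16> is scalarized into two s16 adds since the packed
    // form is not listed as legal here.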

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
    {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);
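
  // Note (illustrative): the s64, s16 and v2s16 cases above are expanded to
  // fadd(x, fneg(y)); the resulting G_FADD is then legalized by the FP op
  // rules registered earlier, so e.g. an s16 fadd is widened to s32 on
  // targets without 16-bit instructions.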

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite
    // loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
    .customFor({S16, S32})
    .scalarize(0)
    .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);
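
  // Illustrative note: G_CTLZ/G_CTTZ are custom because the hardware
  // instructions return -1 for a zero input while the generic opcodes return
  // the bit width; the _ZERO_UNDEF forms above map directly onto the hardware
  // behavior (see legalizeCTLZ_CTTZ below).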

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };
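
  // For example, per the predicate above: a 96-bit access is split on
  // subtargets without dwordx3 load/store support, and any access wider than
  // maxSizeForAddrSpace() for its address space is broken up.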

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }
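
  // For example, a G_SEXTLOAD of s8 from global memory into an s32 result is
  // legal as listed above; wider results are clamped to s32 below, so e.g. an
  // s64 sextload is produced from an s32 load plus an extension.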

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasGFX940Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion
    // in the IR in most situations
    // TODO: Move atomic expansion into legalizer
    // TODO: Also supports <2 x f16>
    Atomic.legalFor({
        {S32, GlobalPtr},
        {S64, GlobalPtr},
        {S64, FlatPtr}
      });
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                     .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx,
            LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });
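
  // Illustrative note: extracts/inserts with 32-bit or 64-bit elements and an
  // s32 index are handled by the custom action above, while vectors with
  // narrower elements (e.g. <4 x s16>) are first bitcast to 32-bit element
  // vectors by bitcastToVectorElement32 so the indexing works on whole
  // registers.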

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
          // TODO: verify immediate offset here since lower only works with
          // whole elements.
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.isVector();
        })
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();
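
  // Note (illustrative): there is no native rotate here, so G_ROTR/G_ROTL are
  // always expanded by the generic lowering, e.g. into a funnel shift where
  // that is legal or into a pair of shifts plus an or otherwise.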

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
      .lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
    return legalizeCTLZ_CTTZ(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
    return legalizeFPTruncRound(MI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
        .addDef(GetReg)
        .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

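// For illustration only: depending on the subtarget, the aperture computed
// above comes from one of two places (register names below are illustrative,
// not what the selector will actually assign):
//
//   ; Subtargets with aperture registers:
//   %getreg:sreg_32(s32) = S_GETREG_B32 <encoded hwreg(HW_REG_MEM_BASES)>
//   %aperture:_(s32)     = G_SHL %getreg, (WidthM1 + 1)
//
//   ; Other subtargets: an invariant, dereferenceable s32 load from the
//   ; queue pointer at offset 0x40 (group) or 0x44 (private).
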
/// Return true if the value is a known valid address, such that a null check
/// is not necessary.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
                           const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
  MachineInstr *Def = MRI.getVRegDef(Val);
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
    return true;
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();
    return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
  }
  default:
    return false;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
      (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
       DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
      // Extract low 32-bits of the pointer.
      B.buildExtract(Dst, Src, 0);
      MI.eraseFromParent();
      return true;
    }

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
      (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
       SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    if (!ST.hasFlatAddressSpace())
      return false;

    Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
    if (!ApertureReg.isValid())
      return false;

    // Coerce the type of the low half of the result so we can use merge_values.
    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

    // TODO: Should we allow mismatched types but matching sizes in merges to
    // avoid the ptrtoint?
    auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});

    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
      B.buildCopy(Dst, BuildPtr);
      MI.eraseFromParent();
      return true;
    }

    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

    auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
                              SegmentNull.getReg(0));

    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

    MI.eraseFromParent();
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      SrcTy.getSizeInBits() == 64) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      DstTy.getSizeInBits() == 64) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another pointer type. Merge operands are required to be the same type,
    // but creating an extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());

  LLVMContext &Ctx = MF.getFunction().getContext();
  Ctx.diagnose(InvalidAddrSpaceCast);
  B.buildUndef(Dst);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

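// Worked example of the constant trick above (assuming the default
// round-to-nearest-even FP mode): for Src = 2.3, the ulp of Src + 2^52 is
// 1.0, so the add rounds to 2^52 + 2.0 and the subtract yields 2.0, which is
// rint(2.3). Any |Src| > 0x1.fffffffffffffp+51 is already an integral double,
// so the final select simply returns Src in that case.
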
bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
                     .addUse(Hi)
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

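// Illustrative example of the masking above: trunc(5.75) has an unbiased
// exponent of 2, so AShr(FractMask, 2) leaves the low 50 fraction bits set;
// and-ing Src with the complement keeps the sign, the exponent and the top 2
// fraction bits, giving exactly 5.0. Exp < 0 means |Src| < 1.0, so only the
// sign bit survives (+/-0.0), and Exp > 51 means Src is already integral.
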
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
                     .addUse(CvtHi.getReg(0))
                     .addUse(ThirtyTwo.getReg(0));

    // TODO: Should this propagate fast-math-flags?
    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();
    return true;
  }

  assert(MRI.getType(Dst) == S32);

  auto One = B.buildConstant(S32, 1);

  MachineInstrBuilder ShAmt;
  if (Signed) {
    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
                               /*HasSideEffects=*/false)
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  } else
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
                   /*HasSideEffects=*/false)
      .addUse(FVal.getReg(0))
      .addUse(Scale.getReg(0));
  MI.eraseFromParent();
  return true;
}

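// In short (illustrative summary): for an s64 result the two halves are
// converted separately and recombined as fp(hi) * 2^32 + fp(lo) via
// llvm.amdgcn.ldexp. For an s32 result the input is first shifted left so its
// leading bit lands in the high word, the low word is folded in as a sticky
// bit (the umin with 1) so rounding stays correct, and the final ldexp by
// (32 - shift) undoes the scaling.
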
// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //  tf := trunc(val);
  // hif := floor(tf * 2^-32);
  // lof := tf - hif * 2^32; // lof is always positive due to floor.
  //  hi := fptoi(hif);
  //  lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
  }
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(S64,
        BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(S64,
        BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMerge(S64, {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
  } else
    B.buildMerge(Dst, {Lo, Hi});
  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
                                               MachineInstr &MI) const {
  MachineFunction &MF = Helper.MIRBuilder.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM.
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;
  const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  if (IdxVal < VecTy.getNumElements()) {
    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));
  } else {
    B.buildUndef(Dst);
  }

  MI.eraseFromParent();
  return true;
}

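// For a constant, in-bounds index this turns, e.g. (illustrative MIR, not
// emitted verbatim):
//   %elt:_(s32) = G_EXTRACT_VECTOR_ELT %vec:_(<4 x s32>), %idx
// into a G_UNMERGE_VALUES of %vec plus a copy of the selected piece.
// Out-of-bounds constant indices become G_IMPLICIT_DEF, and dynamic indices
// are left in place for the selector's register-indexing path.
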
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;

  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));
  (void)Ins;

  unsigned NumElts = VecTy.getNumElements();
  if (IdxVal < NumElts) {
    SmallVector<Register, 8> SrcRegs;
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMerge(Dst, SrcRegs);
  } else {
    B.buildUndef(Dst);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeShuffleVector(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src0);

  if (SrcTy == V2S16 && DstTy == V2S16 &&
      AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
                  .addUse(MulVal.getReg(0))
                  .setMIFlags(Flags)
                  .getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
      .addUse(TrigVal)
      .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

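// The hardware sin/cos intrinsics take their argument in units of
// revolutions, hence the multiply by 0.5 * (1 / pi) above. On subtargets
// where the valid input range is reduced, the scaled value is additionally
// wrapped into [0, 1) with llvm.amdgcn.fract before the trig intrinsic is
// emitted.
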
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address. Similarly for the s_addc_u32 instruction, the
  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
  // instruction.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function",
        MI.getDebugLoc(), DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will be diagnosed during assembly emission anyway.
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place.
    }

    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
      // zero-sized type in other languages, to declare dynamic shared memory
      // whose size is not known at compile time. It is allocated by the
      // runtime and placed directly after the statically allocated memory.
      // All such declarations share the same offset.
      if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        // Adjust alignment for that dynamic shared memory array.
        MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
        LLT S32 = LLT::scalar(32);
        auto Sz =
            B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
        B.buildIntToPtr(DstReg, Sz);
        MI.eraseFromParent();
        return true;
      }
    }

    B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
                                                   *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

static LLT widenToNextPowerOf2(LLT Ty) {
  if (Ty.isVector())
    return Ty.changeElementCount(
        ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
}

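// E.g. widenToNextPowerOf2(<3 x s32>) == <4 x s32> and
// widenToNextPowerOf2(s96) == s128; only the element count (respectively the
// scalar width) is rounded up, the element type itself is unchanged.
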
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const uint64_t AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed.
  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling this edge case, which should probably never be
    // produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value,
        // unmerge from a widened register (e.g. <3 x s16> -> <4 x s16>).
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::fixed_vector(2, ValTy);

  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

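// The target pseudo expects the new value and the compare value packed into a
// two-element vector, so e.g. an s32 cmpxchg ends up roughly as
// (illustrative MIR):
//   %pack:_(<2 x s32>) = G_BUILD_VECTOR %newval:_(s32), %cmpval:_(s32)
//   %res:_(s32) = G_AMDGPU_ATOMIC_CMPXCHG %ptr, %pack
// with the original memory operands carried over.
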
bool AMDGPULegalizerInfo::legalizeFlog(
  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);

  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

  auto K = B.buildFConstant(Ty, numbers::log2e);
  auto Mul = B.buildFMul(Ty, Src, K, Flags);
  B.buildFExp2(Dst, Mul, Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  if (Ty == S32) {
    auto Log = B.buildFLog2(S32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
                   .addUse(Log.getReg(0))
                   .addUse(Src1)
                   .setMIFlags(Flags);
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == S16) {
    // There's no f16 fmul_legacy, so we need to convert for it.
    auto Log = B.buildFLog2(S16, Src0, Flags);
    auto Ext0 = B.buildFPExt(S32, Log, Flags);
    auto Ext1 = B.buildFPExt(S32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
                   .setMIFlags(Flags);

    B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
  } else
    return false;

  MI.eraseFromParent();
  return true;
}

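// This is the usual pow(x, y) = exp2(y * log2(x)) expansion. The multiply
// uses llvm.amdgcn.fmul.legacy, whose DX9-style rule that zero times anything
// (including inf/NaN) is zero keeps the corner cases closer to the expected
// pow behavior; f16 has no fmul_legacy, so that case round-trips through f32.
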
// Find a source register, ignoring any possible source modifiers.
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
  Register ModSrc = OrigSrc;
  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
    ModSrc = SrcFNeg->getOperand(1).getReg();
    if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
      ModSrc = SrcFAbs->getOperand(1).getReg();
  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
    ModSrc = SrcFAbs->getOperand(1).getReg();
  return ModSrc;
}

bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);
  Register Dst = MI.getOperand(0).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
         "this should not have been custom lowered");

  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
  // efficient way to implement it is using V_FRACT_F64. The workaround for the
  // V_FRACT bug is:
  //   fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
  //
  // Convert floor(x) to (x - fract(x))

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
                   .addUse(OrigSrc)
                   .setMIFlags(Flags);

  // Give source modifier matching some assistance before obscuring a foldable
  // pattern.

  // TODO: We can avoid the neg on the fract? The input sign to fract
  // shouldn't matter?
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(S64);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  else
    B.buildFMinNum(Min, Fract, Const, Flags);

  Register CorrectedFract = Min;
  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
  }

  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  return true;
}

// Turn an illegal packed v2s16 build vector into bit operations.
// TODO: This should probably be a bitcast action in LegalizerHelper.
bool AMDGPULegalizerInfo::legalizeBuildVector(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  const LLT S32 = LLT::scalar(32);
  assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  assert(MRI.getType(Src0) == LLT::scalar(16));

  auto Merge = B.buildMerge(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
  return true;
}

// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
// case with a single min instruction instead of a compare+select.
bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));

  MI.eraseFromParent();
  return true;
}

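// For example, for a zero s32 input the ffbh/ffbl pseudos produce -1 (all
// ones), and the unsigned min against the bit width clamps that to 32, which
// is exactly what G_CTLZ/G_CTTZ require for a zero input; non-zero inputs are
// unaffected since their result is always < 32.
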
// Check that this is a G_XOR x, -1
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
  if (MI.getOpcode() != TargetOpcode::G_XOR)
    return false;
  auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
  return ConstVal && *ConstVal == -1;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
                  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);

  if (isNot(MRI, *UseMI)) {
    Register NegatedCond = UseMI->getOperand(0).getReg();
    if (!MRI.hasOneNonDBGUse(NegatedCond))
      return nullptr;

    // We're deleting the def of this value, so we need to remove it.
    eraseInstr(*UseMI, MRI);

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
    Negated = true;
  }

  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
  if (Next == Parent->end()) {
    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(0).getMBB();
  }

  return UseMI;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg,
                                         const TargetRegisterClass *ArgRC,
                                         LLT ArgTy) const {
  MCRegister SrcReg = Arg->getRegister();
  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
                                             *ArgRC, B.getDebugLoc(), ArgTy);
  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    // TODO: Avoid clearing the high bits if we know workitem id y/z are always
    // 0.
    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else {
    B.buildCopy(DstReg, LiveIn);
  }

  return true;
}

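// As an illustration, the three workitem IDs share one packed 32-bit input
// (10 bits each), so an ArgDescriptor whose mask is 0x3ff << 10 (the Y
// component) unpacks as roughly:
//   %shifted:_(s32) = G_LSHR %livein, 10
//   %id:_(s32)      = G_AND %shifted, 0x3ff
// The shift is skipped when the mask already starts at bit 0.
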
bool AMDGPULegalizerInfo::loadInputValue(
  Register DstReg, MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;
  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);

  if (!Arg) {
    if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
      // The intrinsic may appear when we have a zero-sized kernarg segment,
      // in which case the pointer argument may be missing and we use null.
      B.buildConstant(DstReg, 0);
      return true;
    }

    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    B.buildUndef(DstReg);
    return true;
  }

  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these
  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR.
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  if (DstDivReg) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel1, MulHi3);
  }

  if (DstRemReg) {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel2, Sub1);
  }
}

bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
                                                  MachineRegisterInfo &MRI,
                                                  MachineIRBuilder &B) const {
  Register DstDivReg, DstRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    break;
  }
  }

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
  else if (Ty == S64)
    legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
  else
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)
    return false;

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  }

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
  else
    legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);

  if (DstDivReg) {
    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);
  }

  MI.eraseFromParent();
  return true;
}

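// Worked example (illustrative): for -7 / 2 the sign masks are -1 and 0, and
// the add/xor pairs produce |-7| = 7 and |2| = 2. The unsigned helper then
// gives q = 3, r = 1. The quotient sign mask is -1 ^ 0 = -1, so the fixup
// computes (3 ^ -1) - (-1) = -3, and the remainder takes the dividend's sign:
// (1 ^ -1) - (-1) = -1, matching C's truncating division semantics.
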
3332 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3333 .addUse(FNeg.getReg(0)) 3334 .setMIFlags(Flags); 3335 3336 MI.eraseFromParent(); 3337 return true; 3338 } 3339 } 3340 3341 // x / y -> x * (1.0 / y) 3342 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3343 .addUse(RHS) 3344 .setMIFlags(Flags); 3345 B.buildFMul(Res, LHS, RCP, Flags); 3346 3347 MI.eraseFromParent(); 3348 return true; 3349 } 3350 3351 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3352 MachineRegisterInfo &MRI, 3353 MachineIRBuilder &B) const { 3354 Register Res = MI.getOperand(0).getReg(); 3355 Register X = MI.getOperand(1).getReg(); 3356 Register Y = MI.getOperand(2).getReg(); 3357 uint16_t Flags = MI.getFlags(); 3358 LLT ResTy = MRI.getType(Res); 3359 3360 const MachineFunction &MF = B.getMF(); 3361 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3362 MI.getFlag(MachineInstr::FmAfn); 3363 3364 if (!AllowInaccurateRcp) 3365 return false; 3366 3367 auto NegY = B.buildFNeg(ResTy, Y); 3368 auto One = B.buildFConstant(ResTy, 1.0); 3369 3370 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3371 .addUse(Y) 3372 .setMIFlags(Flags); 3373 3374 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3375 R = B.buildFMA(ResTy, Tmp0, R, R); 3376 3377 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3378 R = B.buildFMA(ResTy, Tmp1, R, R); 3379 3380 auto Ret = B.buildFMul(ResTy, X, R); 3381 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3382 3383 B.buildFMA(Res, Tmp2, R, Ret); 3384 MI.eraseFromParent(); 3385 return true; 3386 } 3387 3388 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3389 MachineRegisterInfo &MRI, 3390 MachineIRBuilder &B) const { 3391 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3392 return true; 3393 3394 Register Res = MI.getOperand(0).getReg(); 3395 Register LHS = MI.getOperand(1).getReg(); 3396 Register RHS = MI.getOperand(2).getReg(); 3397 3398 uint16_t Flags = MI.getFlags(); 3399 3400 LLT S16 = LLT::scalar(16); 3401 LLT S32 = LLT::scalar(32); 3402 3403 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3404 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3405 3406 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3407 .addUse(RHSExt.getReg(0)) 3408 .setMIFlags(Flags); 3409 3410 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3411 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3412 3413 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3414 .addUse(RDst.getReg(0)) 3415 .addUse(RHS) 3416 .addUse(LHS) 3417 .setMIFlags(Flags); 3418 3419 MI.eraseFromParent(); 3420 return true; 3421 } 3422 3423 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3424 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3425 static void toggleSPDenormMode(bool Enable, 3426 MachineIRBuilder &B, 3427 const GCNSubtarget &ST, 3428 AMDGPU::SIModeRegisterDefaults Mode) { 3429 // Set SP denorm mode to this value. 3430 unsigned SPDenormMode = 3431 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3432 3433 if (ST.hasDenormModeInst()) { 3434 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3435 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3436 3437 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3438 B.buildInstr(AMDGPU::S_DENORM_MODE) 3439 .addImm(NewDenormModeValue); 3440 3441 } else { 3442 // Select FP32 bit field in mode register. 
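// The immediate below is assumed to encode hwreg(HW_REG_MODE, 4, 2): the two
// FP32 denorm control bits sit at offset 4 of the MODE register, and WIDTH_M1
// holds the field width minus one.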
3443 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3444 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3445 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3446 3447 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3448 .addImm(SPDenormMode) 3449 .addImm(SPDenormModeBitField); 3450 } 3451 } 3452 3453 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3454 MachineRegisterInfo &MRI, 3455 MachineIRBuilder &B) const { 3456 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3457 return true; 3458 3459 Register Res = MI.getOperand(0).getReg(); 3460 Register LHS = MI.getOperand(1).getReg(); 3461 Register RHS = MI.getOperand(2).getReg(); 3462 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3463 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3464 3465 uint16_t Flags = MI.getFlags(); 3466 3467 LLT S32 = LLT::scalar(32); 3468 LLT S1 = LLT::scalar(1); 3469 3470 auto One = B.buildFConstant(S32, 1.0f); 3471 3472 auto DenominatorScaled = 3473 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3474 .addUse(LHS) 3475 .addUse(RHS) 3476 .addImm(0) 3477 .setMIFlags(Flags); 3478 auto NumeratorScaled = 3479 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3480 .addUse(LHS) 3481 .addUse(RHS) 3482 .addImm(1) 3483 .setMIFlags(Flags); 3484 3485 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3486 .addUse(DenominatorScaled.getReg(0)) 3487 .setMIFlags(Flags); 3488 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3489 3490 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3491 // aren't modeled as reading it. 3492 if (!Mode.allFP32Denormals()) 3493 toggleSPDenormMode(true, B, ST, Mode); 3494 3495 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3496 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3497 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3498 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3499 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3500 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3501 3502 if (!Mode.allFP32Denormals()) 3503 toggleSPDenormMode(false, B, ST, Mode); 3504 3505 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3506 .addUse(Fma4.getReg(0)) 3507 .addUse(Fma1.getReg(0)) 3508 .addUse(Fma3.getReg(0)) 3509 .addUse(NumeratorScaled.getReg(1)) 3510 .setMIFlags(Flags); 3511 3512 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3513 .addUse(Fmas.getReg(0)) 3514 .addUse(RHS) 3515 .addUse(LHS) 3516 .setMIFlags(Flags); 3517 3518 MI.eraseFromParent(); 3519 return true; 3520 } 3521 3522 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3523 MachineRegisterInfo &MRI, 3524 MachineIRBuilder &B) const { 3525 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3526 return true; 3527 3528 Register Res = MI.getOperand(0).getReg(); 3529 Register LHS = MI.getOperand(1).getReg(); 3530 Register RHS = MI.getOperand(2).getReg(); 3531 3532 uint16_t Flags = MI.getFlags(); 3533 3534 LLT S64 = LLT::scalar(64); 3535 LLT S1 = LLT::scalar(1); 3536 3537 auto One = B.buildFConstant(S64, 1.0); 3538 3539 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3540 .addUse(LHS) 3541 .addUse(RHS) 3542 .addImm(0) 3543 .setMIFlags(Flags); 3544 3545 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3546 3547 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3548 .addUse(DivScale0.getReg(0)) 3549 .setMIFlags(Flags); 3550 3551 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3552 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3553 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3554 3555 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3556 .addUse(LHS) 3557 .addUse(RHS) 3558 .addImm(1) 3559 .setMIFlags(Flags); 3560 3561 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3562 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3563 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3564 3565 Register Scale; 3566 if (!ST.hasUsableDivScaleConditionOutput()) { 3567 // Workaround a hardware bug on SI where the condition output from div_scale 3568 // is not usable. 3569 3570 LLT S32 = LLT::scalar(32); 3571 3572 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3573 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3574 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3575 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3576 3577 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3578 Scale1Unmerge.getReg(1)); 3579 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3580 Scale0Unmerge.getReg(1)); 3581 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3582 } else { 3583 Scale = DivScale1.getReg(1); 3584 } 3585 3586 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3587 .addUse(Fma4.getReg(0)) 3588 .addUse(Fma3.getReg(0)) 3589 .addUse(Mul.getReg(0)) 3590 .addUse(Scale) 3591 .setMIFlags(Flags); 3592 3593 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3594 .addUse(Fmas.getReg(0)) 3595 .addUse(RHS) 3596 .addUse(LHS) 3597 .setMIFlags(Flags); 3598 3599 MI.eraseFromParent(); 3600 return true; 3601 } 3602 3603 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3604 MachineRegisterInfo &MRI, 3605 MachineIRBuilder &B) const { 3606 Register Res = MI.getOperand(0).getReg(); 3607 Register LHS = MI.getOperand(2).getReg(); 3608 Register RHS = MI.getOperand(3).getReg(); 3609 uint16_t Flags = MI.getFlags(); 3610 3611 LLT S32 = LLT::scalar(32); 3612 LLT S1 = LLT::scalar(1); 3613 3614 auto Abs = B.buildFAbs(S32, RHS, Flags); 3615 const APFloat C0Val(1.0f); 3616 3617 auto C0 = B.buildConstant(S32, 0x6f800000); 3618 auto C1 = B.buildConstant(S32, 0x2f800000); 3619 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3620 3621 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3622 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3623 3624 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3625 3626 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3627 .addUse(Mul0.getReg(0)) 3628 .setMIFlags(Flags); 3629 3630 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3631 3632 B.buildFMul(Res, Sel, Mul1, Flags); 3633 3634 MI.eraseFromParent(); 3635 return true; 3636 } 3637 3638 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3639 // FIXME: Why do we handle this one but not other removed instructions? 3640 // 3641 // Reciprocal square root. The clamp prevents infinite results, clamping 3642 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3643 // +-max_float. 
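// The expansion below emits a plain amdgcn.rsq and then clamps the result
// against +/-max_float with fminnum/fmaxnum (using the IEEE variants when the
// function runs in IEEE mode).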
3644 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3645 MachineRegisterInfo &MRI, 3646 MachineIRBuilder &B) const { 3647 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3648 return true; 3649 3650 Register Dst = MI.getOperand(0).getReg(); 3651 Register Src = MI.getOperand(2).getReg(); 3652 auto Flags = MI.getFlags(); 3653 3654 LLT Ty = MRI.getType(Dst); 3655 3656 const fltSemantics *FltSemantics; 3657 if (Ty == LLT::scalar(32)) 3658 FltSemantics = &APFloat::IEEEsingle(); 3659 else if (Ty == LLT::scalar(64)) 3660 FltSemantics = &APFloat::IEEEdouble(); 3661 else 3662 return false; 3663 3664 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3665 .addUse(Src) 3666 .setMIFlags(Flags); 3667 3668 // We don't need to concern ourselves with the snan handling difference, since 3669 // the rsq quieted (or not) so use the one which will directly select. 3670 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3671 const bool UseIEEE = MFI->getMode().IEEE; 3672 3673 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3674 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3675 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3676 3677 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3678 3679 if (UseIEEE) 3680 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3681 else 3682 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3683 MI.eraseFromParent(); 3684 return true; 3685 } 3686 3687 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3688 switch (IID) { 3689 case Intrinsic::amdgcn_ds_fadd: 3690 return AMDGPU::G_ATOMICRMW_FADD; 3691 case Intrinsic::amdgcn_ds_fmin: 3692 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3693 case Intrinsic::amdgcn_ds_fmax: 3694 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3695 default: 3696 llvm_unreachable("not a DS FP intrinsic"); 3697 } 3698 } 3699 3700 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3701 MachineInstr &MI, 3702 Intrinsic::ID IID) const { 3703 GISelChangeObserver &Observer = Helper.Observer; 3704 Observer.changingInstr(MI); 3705 3706 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3707 3708 // The remaining operands were used to set fields in the MemOperand on 3709 // construction. 3710 for (int I = 6; I > 3; --I) 3711 MI.RemoveOperand(I); 3712 3713 MI.RemoveOperand(1); // Remove the intrinsic ID. 
3714 Observer.changedInstr(MI); 3715 return true; 3716 } 3717 3718 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3719 MachineRegisterInfo &MRI, 3720 MachineIRBuilder &B) const { 3721 uint64_t Offset = 3722 ST.getTargetLowering()->getImplicitParameterOffset( 3723 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3724 LLT DstTy = MRI.getType(DstReg); 3725 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3726 3727 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3728 if (!loadInputValue(KernargPtrReg, B, 3729 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3730 return false; 3731 3732 // FIXME: This should be nuw 3733 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3734 return true; 3735 } 3736 3737 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3738 MachineRegisterInfo &MRI, 3739 MachineIRBuilder &B) const { 3740 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3741 if (!MFI->isEntryFunction()) { 3742 return legalizePreloadedArgIntrin(MI, MRI, B, 3743 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3744 } 3745 3746 Register DstReg = MI.getOperand(0).getReg(); 3747 if (!getImplicitArgPtr(DstReg, MRI, B)) 3748 return false; 3749 3750 MI.eraseFromParent(); 3751 return true; 3752 } 3753 3754 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3755 MachineRegisterInfo &MRI, 3756 MachineIRBuilder &B, 3757 unsigned AddrSpace) const { 3758 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3759 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3760 Register Hi32 = Unmerge.getReg(1); 3761 3762 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3763 MI.eraseFromParent(); 3764 return true; 3765 } 3766 3767 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3768 // offset (the offset that is included in bounds checking and swizzling, to be 3769 // split between the instruction's voffset and immoffset fields) and soffset 3770 // (the offset that is excluded from bounds checking and swizzling, to go in 3771 // the instruction's soffset field). This function takes the first kind of 3772 // offset and figures out how to split it between voffset and immoffset. 3773 std::pair<Register, unsigned> 3774 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3775 Register OrigOffset) const { 3776 const unsigned MaxImm = 4095; 3777 Register BaseReg; 3778 unsigned ImmOffset; 3779 const LLT S32 = LLT::scalar(32); 3780 MachineRegisterInfo &MRI = *B.getMRI(); 3781 3782 std::tie(BaseReg, ImmOffset) = 3783 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3784 3785 // If BaseReg is a pointer, convert it to int. 3786 if (MRI.getType(BaseReg).isPointer()) 3787 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3788 3789 // If the immediate value is too big for the immoffset field, put the value 3790 // and -4096 into the immoffset field so that the value that is copied/added 3791 // for the voffset field is a multiple of 4096, and it stands more chance 3792 // of being CSEd with the copy/add for another similar load/store. 3793 // However, do not do that rounding down to a multiple of 4096 if that is a 3794 // negative number, as it appears to be illegal to have a negative offset 3795 // in the vgpr, even if adding the immediate offset makes it positive. 
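// For example, a constant offset of 5000 becomes an immoffset of 904
// (5000 & 4095) plus an overflow of 4096 that is folded into the voffset
// below.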
3796 unsigned Overflow = ImmOffset & ~MaxImm; 3797 ImmOffset -= Overflow; 3798 if ((int32_t)Overflow < 0) { 3799 Overflow += ImmOffset; 3800 ImmOffset = 0; 3801 } 3802 3803 if (Overflow != 0) { 3804 if (!BaseReg) { 3805 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3806 } else { 3807 auto OverflowVal = B.buildConstant(S32, Overflow); 3808 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3809 } 3810 } 3811 3812 if (!BaseReg) 3813 BaseReg = B.buildConstant(S32, 0).getReg(0); 3814 3815 return std::make_pair(BaseReg, ImmOffset); 3816 } 3817 3818 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3819 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3820 Register VOffset, Register SOffset, 3821 unsigned ImmOffset, Register VIndex, 3822 MachineRegisterInfo &MRI) const { 3823 Optional<ValueAndVReg> MaybeVOffsetVal = 3824 getIConstantVRegValWithLookThrough(VOffset, MRI); 3825 Optional<ValueAndVReg> MaybeSOffsetVal = 3826 getIConstantVRegValWithLookThrough(SOffset, MRI); 3827 Optional<ValueAndVReg> MaybeVIndexVal = 3828 getIConstantVRegValWithLookThrough(VIndex, MRI); 3829 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3830 // update the MMO with that offset. The stride is unknown so we can only do 3831 // this if VIndex is constant 0. 3832 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3833 MaybeVIndexVal->Value == 0) { 3834 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3835 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3836 MMO->setOffset(TotalOffset); 3837 } else { 3838 // We don't have a constant combined offset to use in the MMO. Give up. 3839 MMO->setValue((Value *)nullptr); 3840 } 3841 } 3842 3843 /// Handle register layout difference for f16 images for some subtargets. 
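/// On subtargets with unpacked D16 VMEM, each 16-bit element occupies the low
/// half of a 32-bit register; packed subtargets keep two elements per dword.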
3844 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3845 MachineRegisterInfo &MRI, 3846 Register Reg, 3847 bool ImageStore) const { 3848 const LLT S16 = LLT::scalar(16); 3849 const LLT S32 = LLT::scalar(32); 3850 LLT StoreVT = MRI.getType(Reg); 3851 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3852 3853 if (ST.hasUnpackedD16VMem()) { 3854 auto Unmerge = B.buildUnmerge(S16, Reg); 3855 3856 SmallVector<Register, 4> WideRegs; 3857 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3858 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3859 3860 int NumElts = StoreVT.getNumElements(); 3861 3862 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3863 .getReg(0); 3864 } 3865 3866 if (ImageStore && ST.hasImageStoreD16Bug()) { 3867 if (StoreVT.getNumElements() == 2) { 3868 SmallVector<Register, 4> PackedRegs; 3869 Reg = B.buildBitcast(S32, Reg).getReg(0); 3870 PackedRegs.push_back(Reg); 3871 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3872 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3873 .getReg(0); 3874 } 3875 3876 if (StoreVT.getNumElements() == 3) { 3877 SmallVector<Register, 4> PackedRegs; 3878 auto Unmerge = B.buildUnmerge(S16, Reg); 3879 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3880 PackedRegs.push_back(Unmerge.getReg(I)); 3881 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3882 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3883 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3884 } 3885 3886 if (StoreVT.getNumElements() == 4) { 3887 SmallVector<Register, 4> PackedRegs; 3888 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3889 auto Unmerge = B.buildUnmerge(S32, Reg); 3890 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3891 PackedRegs.push_back(Unmerge.getReg(I)); 3892 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3893 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3894 .getReg(0); 3895 } 3896 3897 llvm_unreachable("invalid data type"); 3898 } 3899 3900 if (StoreVT == LLT::fixed_vector(3, S16)) { 3901 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 3902 .getReg(0); 3903 } 3904 return Reg; 3905 } 3906 3907 Register AMDGPULegalizerInfo::fixStoreSourceType( 3908 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3909 MachineRegisterInfo *MRI = B.getMRI(); 3910 LLT Ty = MRI->getType(VData); 3911 3912 const LLT S16 = LLT::scalar(16); 3913 3914 // Fixup illegal register types for i8 stores. 
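// Scalar s8 and s16 store sources are any-extended to s32; only the low bits
// are consumed by the byte/short store variants selected later.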
3915 if (Ty == LLT::scalar(8) || Ty == S16) { 3916 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3917 return AnyExt; 3918 } 3919 3920 if (Ty.isVector()) { 3921 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3922 if (IsFormat) 3923 return handleD16VData(B, *MRI, VData); 3924 } 3925 } 3926 3927 return VData; 3928 } 3929 3930 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3931 MachineRegisterInfo &MRI, 3932 MachineIRBuilder &B, 3933 bool IsTyped, 3934 bool IsFormat) const { 3935 Register VData = MI.getOperand(1).getReg(); 3936 LLT Ty = MRI.getType(VData); 3937 LLT EltTy = Ty.getScalarType(); 3938 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3939 const LLT S32 = LLT::scalar(32); 3940 3941 VData = fixStoreSourceType(B, VData, IsFormat); 3942 Register RSrc = MI.getOperand(2).getReg(); 3943 3944 MachineMemOperand *MMO = *MI.memoperands_begin(); 3945 const int MemSize = MMO->getSize(); 3946 3947 unsigned ImmOffset; 3948 3949 // The typed intrinsics add an immediate after the registers. 3950 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3951 3952 // The struct intrinsic variants add one additional operand over raw. 3953 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3954 Register VIndex; 3955 int OpOffset = 0; 3956 if (HasVIndex) { 3957 VIndex = MI.getOperand(3).getReg(); 3958 OpOffset = 1; 3959 } else { 3960 VIndex = B.buildConstant(S32, 0).getReg(0); 3961 } 3962 3963 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3964 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3965 3966 unsigned Format = 0; 3967 if (IsTyped) { 3968 Format = MI.getOperand(5 + OpOffset).getImm(); 3969 ++OpOffset; 3970 } 3971 3972 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3973 3974 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3975 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3976 3977 unsigned Opc; 3978 if (IsTyped) { 3979 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3980 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3981 } else if (IsFormat) { 3982 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3983 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3984 } else { 3985 switch (MemSize) { 3986 case 1: 3987 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3988 break; 3989 case 2: 3990 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3991 break; 3992 default: 3993 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3994 break; 3995 } 3996 } 3997 3998 auto MIB = B.buildInstr(Opc) 3999 .addUse(VData) // vdata 4000 .addUse(RSrc) // rsrc 4001 .addUse(VIndex) // vindex 4002 .addUse(VOffset) // voffset 4003 .addUse(SOffset) // soffset 4004 .addImm(ImmOffset); // offset(imm) 4005 4006 if (IsTyped) 4007 MIB.addImm(Format); 4008 4009 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4010 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4011 .addMemOperand(MMO); 4012 4013 MI.eraseFromParent(); 4014 return true; 4015 } 4016 4017 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 4018 MachineRegisterInfo &MRI, 4019 MachineIRBuilder &B, 4020 bool IsFormat, 4021 bool IsTyped) const { 4022 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 4023 MachineMemOperand *MMO = *MI.memoperands_begin(); 4024 const LLT MemTy = MMO->getMemoryType(); 4025 const LLT S32 = LLT::scalar(32); 4026 4027 Register Dst = MI.getOperand(0).getReg(); 4028 Register RSrc = MI.getOperand(2).getReg(); 4029 4030 // The typed intrinsics add an immediate after the registers. 
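// The operand count therefore distinguishes raw from struct forms: a raw,
// untyped load carries dst, intrinsic ID, rsrc, voffset, soffset and aux,
// and only the struct form reaches NumVIndexOps by also carrying a vindex.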
4031 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4032 4033 // The struct intrinsic variants add one additional operand over raw. 4034 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4035 Register VIndex; 4036 int OpOffset = 0; 4037 if (HasVIndex) { 4038 VIndex = MI.getOperand(3).getReg(); 4039 OpOffset = 1; 4040 } else { 4041 VIndex = B.buildConstant(S32, 0).getReg(0); 4042 } 4043 4044 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4045 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4046 4047 unsigned Format = 0; 4048 if (IsTyped) { 4049 Format = MI.getOperand(5 + OpOffset).getImm(); 4050 ++OpOffset; 4051 } 4052 4053 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4054 unsigned ImmOffset; 4055 4056 LLT Ty = MRI.getType(Dst); 4057 LLT EltTy = Ty.getScalarType(); 4058 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4059 const bool Unpacked = ST.hasUnpackedD16VMem(); 4060 4061 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4062 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4063 4064 unsigned Opc; 4065 4066 if (IsTyped) { 4067 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4068 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4069 } else if (IsFormat) { 4070 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4071 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4072 } else { 4073 switch (MemTy.getSizeInBits()) { 4074 case 8: 4075 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4076 break; 4077 case 16: 4078 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4079 break; 4080 default: 4081 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4082 break; 4083 } 4084 } 4085 4086 Register LoadDstReg; 4087 4088 bool IsExtLoad = 4089 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4090 LLT UnpackedTy = Ty.changeElementSize(32); 4091 4092 if (IsExtLoad) 4093 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4094 else if (Unpacked && IsD16 && Ty.isVector()) 4095 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4096 else 4097 LoadDstReg = Dst; 4098 4099 auto MIB = B.buildInstr(Opc) 4100 .addDef(LoadDstReg) // vdata 4101 .addUse(RSrc) // rsrc 4102 .addUse(VIndex) // vindex 4103 .addUse(VOffset) // voffset 4104 .addUse(SOffset) // soffset 4105 .addImm(ImmOffset); // offset(imm) 4106 4107 if (IsTyped) 4108 MIB.addImm(Format); 4109 4110 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4111 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4112 .addMemOperand(MMO); 4113 4114 if (LoadDstReg != Dst) { 4115 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4116 4117 // Widen result for extending loads was widened. 4118 if (IsExtLoad) 4119 B.buildTrunc(Dst, LoadDstReg); 4120 else { 4121 // Repack to original 16-bit vector result 4122 // FIXME: G_TRUNC should work, but legalization currently fails 4123 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4124 SmallVector<Register, 4> Repack; 4125 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4126 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4127 B.buildMerge(Dst, Repack); 4128 } 4129 } 4130 4131 MI.eraseFromParent(); 4132 return true; 4133 } 4134 4135 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4136 MachineIRBuilder &B, 4137 bool IsInc) const { 4138 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4139 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4140 B.buildInstr(Opc) 4141 .addDef(MI.getOperand(0).getReg()) 4142 .addUse(MI.getOperand(2).getReg()) 4143 .addUse(MI.getOperand(3).getReg()) 4144 .cloneMemRefs(MI); 4145 MI.eraseFromParent(); 4146 return true; 4147 } 4148 4149 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4150 switch (IntrID) { 4151 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4152 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4153 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4154 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4155 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4156 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4157 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4158 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4159 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4160 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4161 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4162 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4163 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4164 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4165 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4166 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4167 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4168 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4169 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4170 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4171 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4172 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4173 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4174 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4175 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4176 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4177 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4178 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4179 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4180 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4181 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4182 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4183 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4184 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4185 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4186 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4187 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4188 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4189 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4190 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4191 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4192 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4193 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4194 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4195 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4196 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4197 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4198 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4199 default: 4200 llvm_unreachable("unhandled atomic opcode"); 4201 } 4202 } 4203 4204 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4205 MachineIRBuilder &B, 4206 Intrinsic::ID IID) const { 4207 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4208 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4209 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4210 4211 Register Dst; 4212 4213 int OpOffset = 0; 4214 if (HasReturn) { 4215 // A few FP atomics do not support return values. 
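// When a result def is present it is operand 0; otherwise every source
// operand index shifts down by one, which OpOffset compensates for below.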
4216 Dst = MI.getOperand(0).getReg(); 4217 } else { 4218 OpOffset = -1; 4219 } 4220 4221 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4222 Register CmpVal; 4223 4224 if (IsCmpSwap) { 4225 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4226 ++OpOffset; 4227 } 4228 4229 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4230 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4231 4232 // The struct intrinsic variants add one additional operand over raw. 4233 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4234 Register VIndex; 4235 if (HasVIndex) { 4236 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4237 ++OpOffset; 4238 } else { 4239 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4240 } 4241 4242 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4243 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4244 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4245 4246 MachineMemOperand *MMO = *MI.memoperands_begin(); 4247 4248 unsigned ImmOffset; 4249 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4250 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4251 4252 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4253 4254 if (HasReturn) 4255 MIB.addDef(Dst); 4256 4257 MIB.addUse(VData); // vdata 4258 4259 if (IsCmpSwap) 4260 MIB.addReg(CmpVal); 4261 4262 MIB.addUse(RSrc) // rsrc 4263 .addUse(VIndex) // vindex 4264 .addUse(VOffset) // voffset 4265 .addUse(SOffset) // soffset 4266 .addImm(ImmOffset) // offset(imm) 4267 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4268 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4269 .addMemOperand(MMO); 4270 4271 MI.eraseFromParent(); 4272 return true; 4273 } 4274 4275 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4276 /// vector with s16 typed elements. 4277 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4278 SmallVectorImpl<Register> &PackedAddrs, 4279 unsigned ArgOffset, 4280 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4281 bool IsA16, bool IsG16) { 4282 const LLT S16 = LLT::scalar(16); 4283 const LLT V2S16 = LLT::fixed_vector(2, 16); 4284 auto EndIdx = Intr->VAddrEnd; 4285 4286 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4287 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4288 if (!SrcOp.isReg()) 4289 continue; // _L to _LZ may have eliminated this. 4290 4291 Register AddrReg = SrcOp.getReg(); 4292 4293 if ((I < Intr->GradientStart) || 4294 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4295 (I >= Intr->CoordStart && !IsA16)) { 4296 if ((I < Intr->GradientStart) && IsA16 && 4297 (B.getMRI()->getType(AddrReg) == S16)) { 4298 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 4299 // Special handling of bias when A16 is on. Bias is of type half but 4300 // occupies full 32-bit. 4301 PackedAddrs.push_back( 4302 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4303 .getReg(0)); 4304 } else { 4305 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 4306 "Bias needs to be converted to 16 bit in A16 mode"); 4307 // Handle any gradient or coordinate operands that should not be packed 4308 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4309 PackedAddrs.push_back(AddrReg); 4310 } 4311 } else { 4312 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4313 // derivatives dx/dh and dx/dv are packed with undef. 
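// In other words, whenever there is no following 16-bit component to pair
// with, the current component is widened to a v2s16 with an undef high half.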
4314 if (((I + 1) >= EndIdx) || 4315 ((Intr->NumGradients / 2) % 2 == 1 && 4316 (I == static_cast<unsigned>(Intr->GradientStart + 4317 (Intr->NumGradients / 2) - 1) || 4318 I == static_cast<unsigned>(Intr->GradientStart + 4319 Intr->NumGradients - 1))) || 4320 // Check for _L to _LZ optimization 4321 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4322 PackedAddrs.push_back( 4323 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4324 .getReg(0)); 4325 } else { 4326 PackedAddrs.push_back( 4327 B.buildBuildVector( 4328 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4329 .getReg(0)); 4330 ++I; 4331 } 4332 } 4333 } 4334 } 4335 4336 /// Convert from separate vaddr components to a single vector address register, 4337 /// and replace the remaining operands with $noreg. 4338 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4339 int DimIdx, int NumVAddrs) { 4340 const LLT S32 = LLT::scalar(32); 4341 4342 SmallVector<Register, 8> AddrRegs; 4343 for (int I = 0; I != NumVAddrs; ++I) { 4344 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4345 if (SrcOp.isReg()) { 4346 AddrRegs.push_back(SrcOp.getReg()); 4347 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4348 } 4349 } 4350 4351 int NumAddrRegs = AddrRegs.size(); 4352 if (NumAddrRegs != 1) { 4353 // Above 8 elements round up to next power of 2 (i.e. 16). 4354 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { 4355 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4356 auto Undef = B.buildUndef(S32); 4357 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4358 NumAddrRegs = RoundedNumRegs; 4359 } 4360 4361 auto VAddr = 4362 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4363 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4364 } 4365 4366 for (int I = 1; I != NumVAddrs; ++I) { 4367 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4368 if (SrcOp.isReg()) 4369 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4370 } 4371 } 4372 4373 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4374 /// 4375 /// Depending on the subtarget, load/store with 16-bit element data need to be 4376 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4377 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4378 /// registers. 4379 /// 4380 /// We don't want to directly select image instructions just yet, but also want 4381 /// to exposes all register repacking to the legalizer/combiners. We also don't 4382 /// want a selected instruction entering RegBankSelect. In order to avoid 4383 /// defining a multitude of intermediate image instructions, directly hack on 4384 /// the intrinsic's arguments. In cases like a16 addresses, this requires 4385 /// padding now unnecessary arguments with $noreg. 4386 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4387 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4388 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4389 4390 const unsigned NumDefs = MI.getNumExplicitDefs(); 4391 const unsigned ArgOffset = NumDefs + 1; 4392 bool IsTFE = NumDefs == 2; 4393 // We are only processing the operands of d16 image operations on subtargets 4394 // that use the unpacked register layout, or need to repack the TFE result. 4395 4396 // TODO: Do we need to guard against already legalized intrinsics? 
4397 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4398 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4399 4400 MachineRegisterInfo *MRI = B.getMRI(); 4401 const LLT S32 = LLT::scalar(32); 4402 const LLT S16 = LLT::scalar(16); 4403 const LLT V2S16 = LLT::fixed_vector(2, 16); 4404 4405 unsigned DMask = 0; 4406 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 4407 LLT Ty = MRI->getType(VData); 4408 4409 // Check for 16 bit addresses and pack if true. 4410 LLT GradTy = 4411 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4412 LLT AddrTy = 4413 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4414 const bool IsG16 = GradTy == S16; 4415 const bool IsA16 = AddrTy == S16; 4416 const bool IsD16 = Ty.getScalarType() == S16; 4417 4418 int DMaskLanes = 0; 4419 if (!BaseOpcode->Atomic) { 4420 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4421 if (BaseOpcode->Gather4) { 4422 DMaskLanes = 4; 4423 } else if (DMask != 0) { 4424 DMaskLanes = countPopulation(DMask); 4425 } else if (!IsTFE && !BaseOpcode->Store) { 4426 // If dmask is 0, this is a no-op load. This can be eliminated. 4427 B.buildUndef(MI.getOperand(0)); 4428 MI.eraseFromParent(); 4429 return true; 4430 } 4431 } 4432 4433 Observer.changingInstr(MI); 4434 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4435 4436 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 4437 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 4438 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 4439 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4440 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 4441 4442 // Track that we legalized this 4443 MI.setDesc(B.getTII().get(NewOpcode)); 4444 4445 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4446 // dmask to be at least 1 otherwise the instruction will fail 4447 if (IsTFE && DMask == 0) { 4448 DMask = 0x1; 4449 DMaskLanes = 1; 4450 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4451 } 4452 4453 if (BaseOpcode->Atomic) { 4454 Register VData0 = MI.getOperand(2).getReg(); 4455 LLT Ty = MRI->getType(VData0); 4456 4457 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4458 if (Ty.isVector()) 4459 return false; 4460 4461 if (BaseOpcode->AtomicX2) { 4462 Register VData1 = MI.getOperand(3).getReg(); 4463 // The two values are packed in one register. 4464 LLT PackedTy = LLT::fixed_vector(2, Ty); 4465 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4466 MI.getOperand(2).setReg(Concat.getReg(0)); 4467 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4468 } 4469 } 4470 4471 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4472 4473 // Rewrite the addressing register layout before doing anything else. 
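// Three cases follow: unsupported 16-bit gradient/address combinations are
// rejected, 16-bit components are packed into dwords, and 32-bit components
// are merged into a single vector register when NSA is not used.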
4474 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4475 // 16 bit gradients are supported, but are tied to the A16 control 4476 // so both gradients and addresses must be 16 bit 4477 return false; 4478 } 4479 4480 if (IsA16 && !ST.hasA16()) { 4481 // A16 not supported 4482 return false; 4483 } 4484 4485 if (IsA16 || IsG16) { 4486 if (Intr->NumVAddrs > 1) { 4487 SmallVector<Register, 4> PackedRegs; 4488 4489 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4490 IsG16); 4491 4492 // See also below in the non-a16 branch 4493 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4494 PackedRegs.size() <= ST.getNSAMaxSize(); 4495 4496 if (!UseNSA && PackedRegs.size() > 1) { 4497 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4498 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4499 PackedRegs[0] = Concat.getReg(0); 4500 PackedRegs.resize(1); 4501 } 4502 4503 const unsigned NumPacked = PackedRegs.size(); 4504 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4505 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4506 if (!SrcOp.isReg()) { 4507 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4508 continue; 4509 } 4510 4511 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4512 4513 if (I - Intr->VAddrStart < NumPacked) 4514 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4515 else 4516 SrcOp.setReg(AMDGPU::NoRegister); 4517 } 4518 } 4519 } else { 4520 // If the register allocator cannot place the address registers contiguously 4521 // without introducing moves, then using the non-sequential address encoding 4522 // is always preferable, since it saves VALU instructions and is usually a 4523 // wash in terms of code size or even better. 4524 // 4525 // However, we currently have no way of hinting to the register allocator 4526 // that MIMG addresses should be placed contiguously when it is possible to 4527 // do so, so force non-NSA for the common 2-address case as a heuristic. 4528 // 4529 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4530 // allocation when possible. 4531 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4532 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4533 4534 if (!UseNSA && Intr->NumVAddrs > 1) 4535 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4536 Intr->NumVAddrs); 4537 } 4538 4539 int Flags = 0; 4540 if (IsA16) 4541 Flags |= 1; 4542 if (IsG16) 4543 Flags |= 2; 4544 MI.addOperand(MachineOperand::CreateImm(Flags)); 4545 4546 if (BaseOpcode->Store) { // No TFE for stores? 4547 // TODO: Handle dmask trim 4548 if (!Ty.isVector() || !IsD16) 4549 return true; 4550 4551 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4552 if (RepackedReg != VData) { 4553 MI.getOperand(1).setReg(RepackedReg); 4554 } 4555 4556 return true; 4557 } 4558 4559 Register DstReg = MI.getOperand(0).getReg(); 4560 const LLT EltTy = Ty.getScalarType(); 4561 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4562 4563 // Confirm that the return type is large enough for the dmask specified 4564 if (NumElts < DMaskLanes) 4565 return false; 4566 4567 if (NumElts > 4 || DMaskLanes > 4) 4568 return false; 4569 4570 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4571 const LLT AdjustedTy = 4572 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4573 4574 // The raw dword aligned data component of the load. 
The only legal cases 4575 // where this matters should be when using the packed D16 format, for 4576 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4577 LLT RoundedTy; 4578 4579 // S32 vector to to cover all data, plus TFE result element. 4580 LLT TFETy; 4581 4582 // Register type to use for each loaded component. Will be S32 or V2S16. 4583 LLT RegTy; 4584 4585 if (IsD16 && ST.hasUnpackedD16VMem()) { 4586 RoundedTy = 4587 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4588 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4589 RegTy = S32; 4590 } else { 4591 unsigned EltSize = EltTy.getSizeInBits(); 4592 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4593 unsigned RoundedSize = 32 * RoundedElts; 4594 RoundedTy = LLT::scalarOrVector( 4595 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4596 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4597 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4598 } 4599 4600 // The return type does not need adjustment. 4601 // TODO: Should we change s16 case to s32 or <2 x s16>? 4602 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4603 return true; 4604 4605 Register Dst1Reg; 4606 4607 // Insert after the instruction. 4608 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4609 4610 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4611 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4612 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4613 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4614 4615 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4616 4617 MI.getOperand(0).setReg(NewResultReg); 4618 4619 // In the IR, TFE is supposed to be used with a 2 element struct return 4620 // type. The instruction really returns these two values in one contiguous 4621 // register, with one additional dword beyond the loaded data. Rewrite the 4622 // return type to use a single register result. 4623 4624 if (IsTFE) { 4625 Dst1Reg = MI.getOperand(1).getReg(); 4626 if (MRI->getType(Dst1Reg) != S32) 4627 return false; 4628 4629 // TODO: Make sure the TFE operand bit is set. 4630 MI.RemoveOperand(1); 4631 4632 // Handle the easy case that requires no repack instructions. 4633 if (Ty == S32) { 4634 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4635 return true; 4636 } 4637 } 4638 4639 // Now figure out how to copy the new result register back into the old 4640 // result. 4641 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4642 4643 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4644 4645 if (ResultNumRegs == 1) { 4646 assert(!IsTFE); 4647 ResultRegs[0] = NewResultReg; 4648 } else { 4649 // We have to repack into a new vector of some kind. 4650 for (int I = 0; I != NumDataRegs; ++I) 4651 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4652 B.buildUnmerge(ResultRegs, NewResultReg); 4653 4654 // Drop the final TFE element to get the data part. The TFE result is 4655 // directly written to the right place already. 4656 if (IsTFE) 4657 ResultRegs.resize(NumDataRegs); 4658 } 4659 4660 // For an s16 scalar result, we form an s32 result with a truncate regardless 4661 // of packed vs. unpacked. 4662 if (IsD16 && !Ty.isVector()) { 4663 B.buildTrunc(DstReg, ResultRegs[0]); 4664 return true; 4665 } 4666 4667 // Avoid a build/concat_vector of 1 entry. 
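// Here the single dword-sized data register is simply bitcast to the v2s16
// destination.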
4668 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4669 B.buildBitcast(DstReg, ResultRegs[0]); 4670 return true; 4671 } 4672 4673 assert(Ty.isVector()); 4674 4675 if (IsD16) { 4676 // For packed D16 results with TFE enabled, all the data components are 4677 // S32. Cast back to the expected type. 4678 // 4679 // TODO: We don't really need to use load s32 elements. We would only need one 4680 // cast for the TFE result if a multiple of v2s16 was used. 4681 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4682 for (Register &Reg : ResultRegs) 4683 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4684 } else if (ST.hasUnpackedD16VMem()) { 4685 for (Register &Reg : ResultRegs) 4686 Reg = B.buildTrunc(S16, Reg).getReg(0); 4687 } 4688 } 4689 4690 auto padWithUndef = [&](LLT Ty, int NumElts) { 4691 if (NumElts == 0) 4692 return; 4693 Register Undef = B.buildUndef(Ty).getReg(0); 4694 for (int I = 0; I != NumElts; ++I) 4695 ResultRegs.push_back(Undef); 4696 }; 4697 4698 // Pad out any elements eliminated due to the dmask. 4699 LLT ResTy = MRI->getType(ResultRegs[0]); 4700 if (!ResTy.isVector()) { 4701 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4702 B.buildBuildVector(DstReg, ResultRegs); 4703 return true; 4704 } 4705 4706 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4707 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4708 4709 // Deal with the one annoying legal case. 4710 const LLT V3S16 = LLT::fixed_vector(3, 16); 4711 if (Ty == V3S16) { 4712 if (IsTFE) { 4713 if (ResultRegs.size() == 1) { 4714 NewResultReg = ResultRegs[0]; 4715 } else if (ResultRegs.size() == 2) { 4716 LLT V4S16 = LLT::fixed_vector(4, 16); 4717 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 4718 } else { 4719 return false; 4720 } 4721 } 4722 4723 if (MRI->getType(DstReg).getNumElements() < 4724 MRI->getType(NewResultReg).getNumElements()) { 4725 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 4726 } else { 4727 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 4728 } 4729 return true; 4730 } 4731 4732 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4733 B.buildConcatVectors(DstReg, ResultRegs); 4734 return true; 4735 } 4736 4737 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4738 LegalizerHelper &Helper, MachineInstr &MI) const { 4739 MachineIRBuilder &B = Helper.MIRBuilder; 4740 GISelChangeObserver &Observer = Helper.Observer; 4741 4742 Register Dst = MI.getOperand(0).getReg(); 4743 LLT Ty = B.getMRI()->getType(Dst); 4744 unsigned Size = Ty.getSizeInBits(); 4745 MachineFunction &MF = B.getMF(); 4746 4747 Observer.changingInstr(MI); 4748 4749 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4750 Ty = getBitcastRegisterType(Ty); 4751 Helper.bitcastDst(MI, Ty, 0); 4752 Dst = MI.getOperand(0).getReg(); 4753 B.setInsertPt(B.getMBB(), MI); 4754 } 4755 4756 // FIXME: We don't really need this intermediate instruction. The intrinsic 4757 // should be fixed to have a memory operand. Since it's readnone, we're not 4758 // allowed to add one. 4759 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4760 MI.RemoveOperand(1); // Remove intrinsic ID 4761 4762 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4763 // TODO: Should this use datalayout alignment? 
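// Synthesize an invariant, dereferenceable MMO covering the result rounded up
// to whole bytes, with a conservative 4-byte alignment.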
4764 const unsigned MemSize = (Size + 7) / 8; 4765 const Align MemAlign(4); 4766 MachineMemOperand *MMO = MF.getMachineMemOperand( 4767 MachinePointerInfo(), 4768 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4769 MachineMemOperand::MOInvariant, 4770 MemSize, MemAlign); 4771 MI.addMemOperand(MF, MMO); 4772 4773 // There are no 96-bit result scalar loads, but widening to 128-bit should 4774 // always be legal. We may need to restore this to a 96-bit result if it turns 4775 // out this needs to be converted to a vector load during RegBankSelect. 4776 if (!isPowerOf2_32(Size)) { 4777 if (Ty.isVector()) 4778 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4779 else 4780 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4781 } 4782 4783 Observer.changedInstr(MI); 4784 return true; 4785 } 4786 4787 // TODO: Move to selection 4788 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4789 MachineRegisterInfo &MRI, 4790 MachineIRBuilder &B) const { 4791 if (!ST.isTrapHandlerEnabled() || 4792 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4793 return legalizeTrapEndpgm(MI, MRI, B); 4794 4795 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4796 switch (*HsaAbiVer) { 4797 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4798 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4799 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4800 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4801 case ELF::ELFABIVERSION_AMDGPU_HSA_V5: 4802 return ST.supportsGetDoorbellID() ? 4803 legalizeTrapHsa(MI, MRI, B) : 4804 legalizeTrapHsaQueuePtr(MI, MRI, B); 4805 } 4806 } 4807 4808 llvm_unreachable("Unknown trap handler"); 4809 } 4810 4811 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4812 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4813 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4814 MI.eraseFromParent(); 4815 return true; 4816 } 4817 4818 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4819 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4820 // Pass queue pointer to trap handler as input, and insert trap instruction 4821 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4822 Register LiveIn = 4823 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4824 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4825 return false; 4826 4827 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4828 B.buildCopy(SGPR01, LiveIn); 4829 B.buildInstr(AMDGPU::S_TRAP) 4830 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4831 .addReg(SGPR01, RegState::Implicit); 4832 4833 MI.eraseFromParent(); 4834 return true; 4835 } 4836 4837 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4838 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4839 B.buildInstr(AMDGPU::S_TRAP) 4840 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4841 MI.eraseFromParent(); 4842 return true; 4843 } 4844 4845 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4846 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4847 // Is non-HSA path or trap-handler disabled? 
Then, report a warning 4848 // accordingly 4849 if (!ST.isTrapHandlerEnabled() || 4850 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4851 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4852 "debugtrap handler not supported", 4853 MI.getDebugLoc(), DS_Warning); 4854 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4855 Ctx.diagnose(NoTrap); 4856 } else { 4857 // Insert debug-trap instruction 4858 B.buildInstr(AMDGPU::S_TRAP) 4859 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4860 } 4861 4862 MI.eraseFromParent(); 4863 return true; 4864 } 4865 4866 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4867 MachineIRBuilder &B) const { 4868 MachineRegisterInfo &MRI = *B.getMRI(); 4869 const LLT S16 = LLT::scalar(16); 4870 const LLT S32 = LLT::scalar(32); 4871 4872 Register DstReg = MI.getOperand(0).getReg(); 4873 Register NodePtr = MI.getOperand(2).getReg(); 4874 Register RayExtent = MI.getOperand(3).getReg(); 4875 Register RayOrigin = MI.getOperand(4).getReg(); 4876 Register RayDir = MI.getOperand(5).getReg(); 4877 Register RayInvDir = MI.getOperand(6).getReg(); 4878 Register TDescr = MI.getOperand(7).getReg(); 4879 4880 if (!ST.hasGFX10_AEncoding()) { 4881 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4882 "intrinsic not supported on subtarget", 4883 MI.getDebugLoc()); 4884 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4885 return false; 4886 } 4887 4888 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4889 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4890 const unsigned NumVDataDwords = 4; 4891 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 4892 const bool UseNSA = 4893 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4894 const unsigned BaseOpcodes[2][2] = { 4895 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4896 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4897 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4898 int Opcode; 4899 if (UseNSA) { 4900 Opcode = 4901 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4902 NumVDataDwords, NumVAddrDwords); 4903 } else { 4904 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4905 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4906 PowerOf2Ceil(NumVAddrDwords)); 4907 } 4908 assert(Opcode != -1); 4909 4910 SmallVector<Register, 12> Ops; 4911 if (Is64) { 4912 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4913 Ops.push_back(Unmerge.getReg(0)); 4914 Ops.push_back(Unmerge.getReg(1)); 4915 } else { 4916 Ops.push_back(NodePtr); 4917 } 4918 Ops.push_back(RayExtent); 4919 4920 auto packLanes = [&Ops, &S32, &B](Register Src) { 4921 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 4922 Ops.push_back(Unmerge.getReg(0)); 4923 Ops.push_back(Unmerge.getReg(1)); 4924 Ops.push_back(Unmerge.getReg(2)); 4925 }; 4926 4927 packLanes(RayOrigin); 4928 if (IsA16) { 4929 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 4930 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 4931 Register R1 = MRI.createGenericVirtualRegister(S32); 4932 Register R2 = MRI.createGenericVirtualRegister(S32); 4933 Register R3 = MRI.createGenericVirtualRegister(S32); 4934 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4935 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4936 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4937 Ops.push_back(R1); 4938 
  if (IsA16) {
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands prepared so far.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
                 .addDef(DstReg)
                 .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
      .addImm(IsA16 ? 1 : 0)
      .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
                                int64_t C) {
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  return true;
}
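
// The amdgcn.if/else/loop intrinsics are matched together with the G_BRCOND
// that consumes their result and rewritten into the EXEC-manipulating branch
// pseudos, roughly (block names illustrative, operand types omitted):
//
//   %res, %mask = G_INTRINSIC_W_SIDE_EFFECTS amdgcn.if, %cond
//   G_BRCOND %res, %bb.then           -->   %mask = SI_IF %cond, %bb.flow
//   G_BR %bb.flow                           G_BR %bb.then
//
// The pseudos are expanded into real EXEC updates later by SILowerControlFlow.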
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND that uses the intrinsic result with the
  // exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
      return replaceWithConstant(B, MI, 0);
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
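  // The workgroup ID, dispatch pointer, queue pointer, implicit buffer
  // pointer, and dispatch ID intrinsics all read values that the ABI preloads
  // into SGPRs, so they share the common preloaded-argument lowering.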
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
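  // Buffer FP atomics that need their return value are only available with
  // the gfx90a instructions; on other subtargets the result is replaced with
  // an undef and an error diagnostic is emitted so compilation can continue.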
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
    Register DstReg = MI.getOperand(0).getReg();
    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
      Function &F = B.getMF().getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "return versions of fp atomics not supported", B.getDebugLoc(),
          DS_Error);
      F.getContext().diagnose(NoFpRet);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    return legalizeBufferAtomic(MI, B, IntrID);
  }
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}