//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements up to the next power of two
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits up to the next power of two
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding an
/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
/// excludes s1 vectors, which should always be scalarized.
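/// For example, <3 x s16> (48 bits) is widened, but <3 x s32> and <5 x s32> are
/// not, since their sizes are already multiples of 32 bits.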
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size, and
// multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable for
    // global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in a
    // kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

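// Check the memory size (not the register type size) against the per-address-
// space limits above and against the subtarget's unaligned-access support.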
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case when the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);
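  // Note that V32S32 (32 x 32 = 1024 bits) matches MaxRegisterSize, the widest
  // register size handled here.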

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .minScalar(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    // Technically the saturating operations require clamp bit support, but this
    // was introduced at the same time as 16-bit operations.
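    // Because of that, has16BitInsts() is treated here as implying clamp-bit
    // support, with no separate hasIntClamp() check.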
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
    {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

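  // Frame indexes are always private (scratch) pointers.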
  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
    G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
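  // The decision is made in legalizeFMad() during custom lowering: G_FMAD is
  // kept only when denormals can be flushed for the type, and is otherwise
  // expanded to fmul + fadd.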
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };

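  // In the explicit memory descriptors below, the final field is the minimum
  // alignment in bits; a value of 0 means no alignment requirement.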
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

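              // Otherwise break the remaining odd-sized access into pieces no
              // wider than the known alignment.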
              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                     .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
              VecTypeIdx,
              LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
          // TODO: verify immediate offset here since lower only works with
          // whole elements.
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.isVector();
        })
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
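  // Only 32-bit G_FSHR maps directly to an instruction (v_alignbit_b32); G_FSHL
  // and all other types are lowered.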
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
      .lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
MRI, B); 1724 case TargetOpcode::G_UDIV: 1725 case TargetOpcode::G_UREM: 1726 case TargetOpcode::G_UDIVREM: 1727 return legalizeUnsignedDIV_REM(MI, MRI, B); 1728 case TargetOpcode::G_SDIV: 1729 case TargetOpcode::G_SREM: 1730 case TargetOpcode::G_SDIVREM: 1731 return legalizeSignedDIV_REM(MI, MRI, B); 1732 case TargetOpcode::G_ATOMIC_CMPXCHG: 1733 return legalizeAtomicCmpXChg(MI, MRI, B); 1734 case TargetOpcode::G_FLOG: 1735 return legalizeFlog(MI, B, numbers::ln2f); 1736 case TargetOpcode::G_FLOG10: 1737 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1738 case TargetOpcode::G_FEXP: 1739 return legalizeFExp(MI, B); 1740 case TargetOpcode::G_FPOW: 1741 return legalizeFPow(MI, B); 1742 case TargetOpcode::G_FFLOOR: 1743 return legalizeFFloor(MI, MRI, B); 1744 case TargetOpcode::G_BUILD_VECTOR: 1745 return legalizeBuildVector(MI, MRI, B); 1746 case TargetOpcode::G_CTLZ: 1747 case TargetOpcode::G_CTTZ: 1748 return legalizeCTLZ_CTTZ(MI, MRI, B); 1749 default: 1750 return false; 1751 } 1752 1753 llvm_unreachable("expected switch to return"); 1754 } 1755 1756 Register AMDGPULegalizerInfo::getSegmentAperture( 1757 unsigned AS, 1758 MachineRegisterInfo &MRI, 1759 MachineIRBuilder &B) const { 1760 MachineFunction &MF = B.getMF(); 1761 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1762 const LLT S32 = LLT::scalar(32); 1763 1764 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1765 1766 if (ST.hasApertureRegs()) { 1767 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1768 // getreg. 1769 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1770 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1771 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1772 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1773 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1774 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1775 unsigned Encoding = 1776 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1777 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1778 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1779 1780 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1781 1782 B.buildInstr(AMDGPU::S_GETREG_B32) 1783 .addDef(GetReg) 1784 .addImm(Encoding); 1785 MRI.setType(GetReg, S32); 1786 1787 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1788 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1789 } 1790 1791 Register QueuePtr = MRI.createGenericVirtualRegister( 1792 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1793 1794 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1795 return Register(); 1796 1797 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1798 // private_segment_aperture_base_hi. 1799 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1800 1801 // TODO: can we be smarter about machine pointer info? 1802 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1803 MachineMemOperand *MMO = MF.getMachineMemOperand( 1804 PtrInfo, 1805 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1806 MachineMemOperand::MOInvariant, 1807 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1808 1809 Register LoadAddr; 1810 1811 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1812 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1813 } 1814 1815 /// Return true if the value is a known valid address, such that a null check is 1816 /// not necessary. 
/// not necessary.
1817 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1818 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1819 MachineInstr *Def = MRI.getVRegDef(Val); 1820 switch (Def->getOpcode()) { 1821 case AMDGPU::G_FRAME_INDEX: 1822 case AMDGPU::G_GLOBAL_VALUE: 1823 case AMDGPU::G_BLOCK_ADDR: 1824 return true; 1825 case AMDGPU::G_CONSTANT: { 1826 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1827 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1828 } 1829 default: 1830 return false; 1831 } 1832 1833 return false; 1834 } 1835 1836 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1837 MachineInstr &MI, MachineRegisterInfo &MRI, 1838 MachineIRBuilder &B) const { 1839 MachineFunction &MF = B.getMF(); 1840 1841 const LLT S32 = LLT::scalar(32); 1842 Register Dst = MI.getOperand(0).getReg(); 1843 Register Src = MI.getOperand(1).getReg(); 1844 1845 LLT DstTy = MRI.getType(Dst); 1846 LLT SrcTy = MRI.getType(Src); 1847 unsigned DestAS = DstTy.getAddressSpace(); 1848 unsigned SrcAS = SrcTy.getAddressSpace(); 1849 1850 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1851 // vector element. 1852 assert(!DstTy.isVector()); 1853 1854 const AMDGPUTargetMachine &TM 1855 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1856 1857 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1858 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1859 return true; 1860 } 1861 1862 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1863 // Truncate. 1864 B.buildExtract(Dst, Src, 0); 1865 MI.eraseFromParent(); 1866 return true; 1867 } 1868 1869 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1870 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1871 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1872 1873 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1874 // another. Merge operands are required to be the same type, but creating an 1875 // extra ptrtoint would be kind of pointless. 1876 auto HighAddr = B.buildConstant( 1877 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1878 B.buildMerge(Dst, {Src, HighAddr}); 1879 MI.eraseFromParent(); 1880 return true; 1881 } 1882 1883 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1884 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1885 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1886 1887 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1888 // Extract low 32-bits of the pointer. 1889 B.buildExtract(Dst, Src, 0); 1890 MI.eraseFromParent(); 1891 return true; 1892 } 1893 1894 unsigned NullVal = TM.getNullPointerValue(DestAS); 1895 1896 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1897 auto FlatNull = B.buildConstant(SrcTy, 0); 1898 1899 // Extract low 32-bits of the pointer. 1900 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1901 1902 auto CmpRes = 1903 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1904 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1905 1906 MI.eraseFromParent(); 1907 return true; 1908 } 1909 1910 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1911 return false; 1912 1913 if (!ST.hasFlatAddressSpace()) 1914 return false; 1915 1916 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1917 if (!ApertureReg.isValid()) 1918 return false; 1919 1920 // Coerce the type of the low half of the result so we can use merge_values. 
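  // Illustrative sketch (added comment, not from the original source): for a
  // local (p3) -> flat (p0) cast the sequence built below is roughly
  //   %lo:_(s32)  = G_PTRTOINT %src(p3)
  //   %flat:_(p0) = G_MERGE_VALUES %lo(s32), %aperture(s32)
  // with a compare-and-select against the segment/flat null values appended
  // when %src is not known to be non-null.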
1921 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1922 1923 // TODO: Should we allow mismatched types but matching sizes in merges to 1924 // avoid the ptrtoint? 1925 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1926 1927 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1928 B.buildCopy(Dst, BuildPtr); 1929 MI.eraseFromParent(); 1930 return true; 1931 } 1932 1933 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1934 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1935 1936 auto CmpRes = 1937 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1938 1939 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1940 1941 MI.eraseFromParent(); 1942 return true; 1943 } 1944 1945 bool AMDGPULegalizerInfo::legalizeFrint( 1946 MachineInstr &MI, MachineRegisterInfo &MRI, 1947 MachineIRBuilder &B) const { 1948 Register Src = MI.getOperand(1).getReg(); 1949 LLT Ty = MRI.getType(Src); 1950 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1951 1952 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1953 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1954 1955 auto C1 = B.buildFConstant(Ty, C1Val); 1956 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1957 1958 // TODO: Should this propagate fast-math-flags? 1959 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1960 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1961 1962 auto C2 = B.buildFConstant(Ty, C2Val); 1963 auto Fabs = B.buildFAbs(Ty, Src); 1964 1965 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1966 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1967 MI.eraseFromParent(); 1968 return true; 1969 } 1970 1971 bool AMDGPULegalizerInfo::legalizeFceil( 1972 MachineInstr &MI, MachineRegisterInfo &MRI, 1973 MachineIRBuilder &B) const { 1974 1975 const LLT S1 = LLT::scalar(1); 1976 const LLT S64 = LLT::scalar(64); 1977 1978 Register Src = MI.getOperand(1).getReg(); 1979 assert(MRI.getType(Src) == S64); 1980 1981 // result = trunc(src) 1982 // if (src > 0.0 && src != result) 1983 // result += 1.0 1984 1985 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1986 1987 const auto Zero = B.buildFConstant(S64, 0.0); 1988 const auto One = B.buildFConstant(S64, 1.0); 1989 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1990 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1991 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1992 auto Add = B.buildSelect(S64, And, One, Zero); 1993 1994 // TODO: Should this propagate fast-math-flags? 
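  // Added note: the G_SELECT above materializes the conditional increment as
  // a data value, Add = (src > 0.0 && src != trunc(src)) ? 1.0 : 0.0, so the
  // ceil result is formed branchlessly as trunc(src) + Add by the G_FADD
  // below.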
1995 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1996 MI.eraseFromParent(); 1997 return true; 1998 } 1999 2000 bool AMDGPULegalizerInfo::legalizeFrem( 2001 MachineInstr &MI, MachineRegisterInfo &MRI, 2002 MachineIRBuilder &B) const { 2003 Register DstReg = MI.getOperand(0).getReg(); 2004 Register Src0Reg = MI.getOperand(1).getReg(); 2005 Register Src1Reg = MI.getOperand(2).getReg(); 2006 auto Flags = MI.getFlags(); 2007 LLT Ty = MRI.getType(DstReg); 2008 2009 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2010 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2011 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2012 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2013 MI.eraseFromParent(); 2014 return true; 2015 } 2016 2017 static MachineInstrBuilder extractF64Exponent(Register Hi, 2018 MachineIRBuilder &B) { 2019 const unsigned FractBits = 52; 2020 const unsigned ExpBits = 11; 2021 LLT S32 = LLT::scalar(32); 2022 2023 auto Const0 = B.buildConstant(S32, FractBits - 32); 2024 auto Const1 = B.buildConstant(S32, ExpBits); 2025 2026 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2027 .addUse(Hi) 2028 .addUse(Const0.getReg(0)) 2029 .addUse(Const1.getReg(0)); 2030 2031 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2032 } 2033 2034 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2035 MachineInstr &MI, MachineRegisterInfo &MRI, 2036 MachineIRBuilder &B) const { 2037 const LLT S1 = LLT::scalar(1); 2038 const LLT S32 = LLT::scalar(32); 2039 const LLT S64 = LLT::scalar(64); 2040 2041 Register Src = MI.getOperand(1).getReg(); 2042 assert(MRI.getType(Src) == S64); 2043 2044 // TODO: Should this use extract since the low half is unused? 2045 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2046 Register Hi = Unmerge.getReg(1); 2047 2048 // Extract the upper half, since this is where we will find the sign and 2049 // exponent. 2050 auto Exp = extractF64Exponent(Hi, B); 2051 2052 const unsigned FractBits = 52; 2053 2054 // Extract the sign bit. 2055 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2056 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2057 2058 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2059 2060 const auto Zero32 = B.buildConstant(S32, 0); 2061 2062 // Extend back to 64-bits. 2063 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2064 2065 auto Shr = B.buildAShr(S64, FractMask, Exp); 2066 auto Not = B.buildNot(S64, Shr); 2067 auto Tmp0 = B.buildAnd(S64, Src, Not); 2068 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2069 2070 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2071 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2072 2073 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2074 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2075 MI.eraseFromParent(); 2076 return true; 2077 } 2078 2079 bool AMDGPULegalizerInfo::legalizeITOFP( 2080 MachineInstr &MI, MachineRegisterInfo &MRI, 2081 MachineIRBuilder &B, bool Signed) const { 2082 2083 Register Dst = MI.getOperand(0).getReg(); 2084 Register Src = MI.getOperand(1).getReg(); 2085 2086 const LLT S64 = LLT::scalar(64); 2087 const LLT S32 = LLT::scalar(32); 2088 2089 assert(MRI.getType(Src) == S64); 2090 2091 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2092 auto ThirtyTwo = B.buildConstant(S32, 32); 2093 2094 if (MRI.getType(Dst) == S64) { 2095 auto CvtHi = Signed ? 
B.buildSITOFP(S64, Unmerge.getReg(1)) 2096 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2097 2098 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2099 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2100 .addUse(CvtHi.getReg(0)) 2101 .addUse(ThirtyTwo.getReg(0)); 2102 2103 // TODO: Should this propagate fast-math-flags? 2104 B.buildFAdd(Dst, LdExp, CvtLo); 2105 MI.eraseFromParent(); 2106 return true; 2107 } 2108 2109 assert(MRI.getType(Dst) == S32); 2110 2111 auto One = B.buildConstant(S32, 1); 2112 2113 MachineInstrBuilder ShAmt; 2114 if (Signed) { 2115 auto ThirtyOne = B.buildConstant(S32, 31); 2116 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2117 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2118 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2119 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2120 /*HasSideEffects=*/false) 2121 .addUse(Unmerge.getReg(1)); 2122 auto LS2 = B.buildSub(S32, LS, One); 2123 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2124 } else 2125 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2126 auto Norm = B.buildShl(S64, Src, ShAmt); 2127 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2128 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2129 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2130 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2131 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2132 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2133 /*HasSideEffects=*/false) 2134 .addUse(FVal.getReg(0)) 2135 .addUse(Scale.getReg(0)); 2136 MI.eraseFromParent(); 2137 return true; 2138 } 2139 2140 // TODO: Copied from DAG implementation. Verify logic and document how this 2141 // actually works. 2142 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2143 MachineRegisterInfo &MRI, 2144 MachineIRBuilder &B, 2145 bool Signed) const { 2146 2147 Register Dst = MI.getOperand(0).getReg(); 2148 Register Src = MI.getOperand(1).getReg(); 2149 2150 const LLT S64 = LLT::scalar(64); 2151 const LLT S32 = LLT::scalar(32); 2152 2153 const LLT SrcLT = MRI.getType(Src); 2154 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2155 2156 unsigned Flags = MI.getFlags(); 2157 2158 // The basic idea of converting a floating point number into a pair of 32-bit 2159 // integers is illustrated as follows: 2160 // 2161 // tf := trunc(val); 2162 // hif := floor(tf * 2^-32); 2163 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2164 // hi := fptoi(hif); 2165 // lo := fptoi(lof); 2166 // 2167 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2168 MachineInstrBuilder Sign; 2169 if (Signed && SrcLT == S32) { 2170 // However, a 32-bit floating point number has only 23 bits mantissa and 2171 // it's not enough to hold all the significant bits of `lof` if val is 2172 // negative. To avoid the loss of precision, We need to take the absolute 2173 // value after truncating and flip the result back based on the original 2174 // signedness. 
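  // Added sketch: with sign = src >> 31 (arithmetic shift, so all zeros or
  // all ones), the code below computes
  //   r = xor({lo, hi}, {sign, sign}) - {sign, sign}
  // i.e. a conditional two's-complement negation of the value converted from
  // |trunc(src)|, restoring the original sign without extra control flow.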
2175 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2176 Trunc = B.buildFAbs(S32, Trunc, Flags); 2177 } 2178 MachineInstrBuilder K0, K1; 2179 if (SrcLT == S64) { 2180 K0 = B.buildFConstant(S64, 2181 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2182 K1 = B.buildFConstant(S64, 2183 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2184 } else { 2185 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2186 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2187 } 2188 2189 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2190 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2191 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2192 2193 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2194 : B.buildFPTOUI(S32, FloorMul); 2195 auto Lo = B.buildFPTOUI(S32, Fma); 2196 2197 if (Signed && SrcLT == S32) { 2198 // Flip the result based on the signedness, which is either all 0s or 1s. 2199 Sign = B.buildMerge(S64, {Sign, Sign}); 2200 // r := xor({lo, hi}, sign) - sign; 2201 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2202 } else 2203 B.buildMerge(Dst, {Lo, Hi}); 2204 MI.eraseFromParent(); 2205 2206 return true; 2207 } 2208 2209 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2210 MachineInstr &MI) const { 2211 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2212 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2213 2214 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2215 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2216 2217 // With ieee_mode disabled, the instructions have the correct behavior 2218 // already for G_FMINNUM/G_FMAXNUM 2219 if (!MFI->getMode().IEEE) 2220 return !IsIEEEOp; 2221 2222 if (IsIEEEOp) 2223 return true; 2224 2225 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2226 } 2227 2228 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2229 MachineInstr &MI, MachineRegisterInfo &MRI, 2230 MachineIRBuilder &B) const { 2231 // TODO: Should move some of this into LegalizerHelper. 2232 2233 // TODO: Promote dynamic indexing of s16 to s32 2234 2235 // FIXME: Artifact combiner probably should have replaced the truncated 2236 // constant before this, so we shouldn't need 2237 // getIConstantVRegValWithLookThrough. 2238 Optional<ValueAndVReg> MaybeIdxVal = 2239 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2240 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2241 return true; 2242 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2243 2244 Register Dst = MI.getOperand(0).getReg(); 2245 Register Vec = MI.getOperand(1).getReg(); 2246 2247 LLT VecTy = MRI.getType(Vec); 2248 LLT EltTy = VecTy.getElementType(); 2249 assert(EltTy == MRI.getType(Dst)); 2250 2251 if (IdxVal < VecTy.getNumElements()) { 2252 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2253 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2254 } else { 2255 B.buildUndef(Dst); 2256 } 2257 2258 MI.eraseFromParent(); 2259 return true; 2260 } 2261 2262 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2263 MachineInstr &MI, MachineRegisterInfo &MRI, 2264 MachineIRBuilder &B) const { 2265 // TODO: Should move some of this into LegalizerHelper. 2266 2267 // TODO: Promote dynamic indexing of s16 to s32 2268 2269 // FIXME: Artifact combiner probably should have replaced the truncated 2270 // constant before this, so we shouldn't need 2271 // getIConstantVRegValWithLookThrough. 
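  // Added sketch of the constant-index lowering done below, e.g. for
  // G_INSERT_VECTOR_ELT %vec(<4 x s32>), %val(s32), 2:
  //   %e0, %e1, %e2, %e3 = G_UNMERGE_VALUES %vec
  //   %dst = G_BUILD_VECTOR %e0, %e1, %val, %e3
  // A constant index that is out of bounds instead produces G_IMPLICIT_DEF.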
2272 Optional<ValueAndVReg> MaybeIdxVal = 2273 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2274 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2275 return true; 2276 2277 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2278 Register Dst = MI.getOperand(0).getReg(); 2279 Register Vec = MI.getOperand(1).getReg(); 2280 Register Ins = MI.getOperand(2).getReg(); 2281 2282 LLT VecTy = MRI.getType(Vec); 2283 LLT EltTy = VecTy.getElementType(); 2284 assert(EltTy == MRI.getType(Ins)); 2285 (void)Ins; 2286 2287 unsigned NumElts = VecTy.getNumElements(); 2288 if (IdxVal < NumElts) { 2289 SmallVector<Register, 8> SrcRegs; 2290 for (unsigned i = 0; i < NumElts; ++i) 2291 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2292 B.buildUnmerge(SrcRegs, Vec); 2293 2294 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2295 B.buildMerge(Dst, SrcRegs); 2296 } else { 2297 B.buildUndef(Dst); 2298 } 2299 2300 MI.eraseFromParent(); 2301 return true; 2302 } 2303 2304 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2305 MachineInstr &MI, MachineRegisterInfo &MRI, 2306 MachineIRBuilder &B) const { 2307 const LLT V2S16 = LLT::fixed_vector(2, 16); 2308 2309 Register Dst = MI.getOperand(0).getReg(); 2310 Register Src0 = MI.getOperand(1).getReg(); 2311 LLT DstTy = MRI.getType(Dst); 2312 LLT SrcTy = MRI.getType(Src0); 2313 2314 if (SrcTy == V2S16 && DstTy == V2S16 && 2315 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2316 return true; 2317 2318 MachineIRBuilder HelperBuilder(MI); 2319 GISelObserverWrapper DummyObserver; 2320 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2321 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2322 } 2323 2324 bool AMDGPULegalizerInfo::legalizeSinCos( 2325 MachineInstr &MI, MachineRegisterInfo &MRI, 2326 MachineIRBuilder &B) const { 2327 2328 Register DstReg = MI.getOperand(0).getReg(); 2329 Register SrcReg = MI.getOperand(1).getReg(); 2330 LLT Ty = MRI.getType(DstReg); 2331 unsigned Flags = MI.getFlags(); 2332 2333 Register TrigVal; 2334 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2335 if (ST.hasTrigReducedRange()) { 2336 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2337 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2338 .addUse(MulVal.getReg(0)) 2339 .setMIFlags(Flags).getReg(0); 2340 } else 2341 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2342 2343 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2344 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2345 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2346 .addUse(TrigVal) 2347 .setMIFlags(Flags); 2348 MI.eraseFromParent(); 2349 return true; 2350 } 2351 2352 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2353 MachineIRBuilder &B, 2354 const GlobalValue *GV, 2355 int64_t Offset, 2356 unsigned GAFlags) const { 2357 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2358 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2359 // to the following code sequence: 2360 // 2361 // For constant address space: 2362 // s_getpc_b64 s[0:1] 2363 // s_add_u32 s0, s0, $symbol 2364 // s_addc_u32 s1, s1, 0 2365 // 2366 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2367 // a fixup or relocation is emitted to replace $symbol with a literal 2368 // constant, which is a pc-relative offset from the encoding of the $symbol 2369 // operand to the global variable. 2370 // 2371 // For global address space: 2372 // s_getpc_b64 s[0:1] 2373 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2374 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2375 // 2376 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2377 // fixups or relocations are emitted to replace $symbol@*@lo and 2378 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2379 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2380 // operand to the global variable. 2381 // 2382 // What we want here is an offset from the value returned by s_getpc 2383 // (which is the address of the s_add_u32 instruction) to the global 2384 // variable, but since the encoding of $symbol starts 4 bytes after the start 2385 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2386 // small. This requires us to add 4 to the global variable offset in order to 2387 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2388 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2389 // instruction. 2390 2391 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2392 2393 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2394 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2395 2396 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2397 .addDef(PCReg); 2398 2399 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2400 if (GAFlags == SIInstrInfo::MO_NONE) 2401 MIB.addImm(0); 2402 else 2403 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2404 2405 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2406 2407 if (PtrTy.getSizeInBits() == 32) 2408 B.buildExtract(DstReg, PCReg, 0); 2409 return true; 2410 } 2411 2412 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2413 MachineInstr &MI, MachineRegisterInfo &MRI, 2414 MachineIRBuilder &B) const { 2415 Register DstReg = MI.getOperand(0).getReg(); 2416 LLT Ty = MRI.getType(DstReg); 2417 unsigned AS = Ty.getAddressSpace(); 2418 2419 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2420 MachineFunction &MF = B.getMF(); 2421 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2422 2423 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2424 if (!MFI->isModuleEntryFunction() && 2425 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2426 const Function &Fn = MF.getFunction(); 2427 DiagnosticInfoUnsupported BadLDSDecl( 2428 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2429 DS_Warning); 2430 Fn.getContext().diagnose(BadLDSDecl); 2431 2432 // We currently don't have a way to correctly allocate LDS objects that 2433 // aren't directly associated with a kernel. We do force inlining of 2434 // functions that use local objects. However, if these dead functions are 2435 // not eliminated, we don't want a compile time error. Just emit a warning 2436 // and a trap, since there should be no callable path here. 2437 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2438 B.buildUndef(DstReg); 2439 MI.eraseFromParent(); 2440 return true; 2441 } 2442 2443 // TODO: We could emit code to handle the initialization somewhere. 2444 // We ignore the initializer for now and legalize it to allow selection. 2445 // The initializer will anyway get errored out during assembly emission. 2446 const SITargetLowering *TLI = ST.getTargetLowering(); 2447 if (!TLI->shouldUseLDSConstAddress(GV)) { 2448 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2449 return true; // Leave in place; 2450 } 2451 2452 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2453 Type *Ty = GV->getValueType(); 2454 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2455 // zero-sized type in other languages to declare the dynamic shared 2456 // memory which size is not known at the compile time. They will be 2457 // allocated by the runtime and placed directly after the static 2458 // allocated ones. They all share the same offset. 2459 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2460 // Adjust alignment for that dynamic shared memory array. 
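          // Added example (HIP, illustrative only):
          //   extern __shared__ float dynSmem[];  // size supplied at launch
          // Such a zero-sized external LDS array is handled here by emitting
          //   %sz:_(s32)  = G_INTRINSIC llvm.amdgcn.groupstaticsize
          //   %ptr:_(p3)  = G_INTTOPTR %sz
          // so the dynamic region starts right after all static LDS
          // allocations.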
2461 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2462 LLT S32 = LLT::scalar(32); 2463 auto Sz = 2464 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2465 B.buildIntToPtr(DstReg, Sz); 2466 MI.eraseFromParent(); 2467 return true; 2468 } 2469 } 2470 2471 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2472 *cast<GlobalVariable>(GV))); 2473 MI.eraseFromParent(); 2474 return true; 2475 } 2476 2477 const SITargetLowering *TLI = ST.getTargetLowering(); 2478 2479 if (TLI->shouldEmitFixup(GV)) { 2480 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2481 MI.eraseFromParent(); 2482 return true; 2483 } 2484 2485 if (TLI->shouldEmitPCReloc(GV)) { 2486 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2487 MI.eraseFromParent(); 2488 return true; 2489 } 2490 2491 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2492 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2493 2494 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2495 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2496 MachinePointerInfo::getGOT(MF), 2497 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2498 MachineMemOperand::MOInvariant, 2499 LoadTy, Align(8)); 2500 2501 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2502 2503 if (Ty.getSizeInBits() == 32) { 2504 // Truncate if this is a 32-bit constant address. 2505 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2506 B.buildExtract(DstReg, Load, 0); 2507 } else 2508 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2509 2510 MI.eraseFromParent(); 2511 return true; 2512 } 2513 2514 static LLT widenToNextPowerOf2(LLT Ty) { 2515 if (Ty.isVector()) 2516 return Ty.changeElementCount( 2517 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2518 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2519 } 2520 2521 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2522 MachineInstr &MI) const { 2523 MachineIRBuilder &B = Helper.MIRBuilder; 2524 MachineRegisterInfo &MRI = *B.getMRI(); 2525 GISelChangeObserver &Observer = Helper.Observer; 2526 2527 Register PtrReg = MI.getOperand(1).getReg(); 2528 LLT PtrTy = MRI.getType(PtrReg); 2529 unsigned AddrSpace = PtrTy.getAddressSpace(); 2530 2531 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2532 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2533 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2534 Observer.changingInstr(MI); 2535 MI.getOperand(1).setReg(Cast.getReg(0)); 2536 Observer.changedInstr(MI); 2537 return true; 2538 } 2539 2540 if (MI.getOpcode() != AMDGPU::G_LOAD) 2541 return false; 2542 2543 Register ValReg = MI.getOperand(0).getReg(); 2544 LLT ValTy = MRI.getType(ValReg); 2545 2546 MachineMemOperand *MMO = *MI.memoperands_begin(); 2547 const unsigned ValSize = ValTy.getSizeInBits(); 2548 const LLT MemTy = MMO->getMemoryType(); 2549 const Align MemAlign = MMO->getAlign(); 2550 const unsigned MemSize = MemTy.getSizeInBits(); 2551 const uint64_t AlignInBits = 8 * MemAlign.value(); 2552 2553 // Widen non-power-of-2 loads to the alignment if needed 2554 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2555 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2556 2557 // This was already the correct extending load result type, so just adjust 2558 // the memory type. 
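    // Added example (illustrative): an extending load such as
    //   s128 = G_LOAD ptr (s96, align 16)
    // takes this branch and only its memory operand is widened to 128 bits; a
    // plain s96 load instead falls through below, where the wide load result
    // is truncated or extracted back down to the original type.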
2559 if (WideMemSize == ValSize) { 2560 MachineFunction &MF = B.getMF(); 2561 2562 MachineMemOperand *WideMMO = 2563 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2564 Observer.changingInstr(MI); 2565 MI.setMemRefs(MF, {WideMMO}); 2566 Observer.changedInstr(MI); 2567 return true; 2568 } 2569 2570 // Don't bother handling edge case that should probably never be produced. 2571 if (ValSize > WideMemSize) 2572 return false; 2573 2574 LLT WideTy = widenToNextPowerOf2(ValTy); 2575 2576 Register WideLoad; 2577 if (!WideTy.isVector()) { 2578 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2579 B.buildTrunc(ValReg, WideLoad).getReg(0); 2580 } else { 2581 // Extract the subvector. 2582 2583 if (isRegisterType(ValTy)) { 2584 // If this a case where G_EXTRACT is legal, use it. 2585 // (e.g. <3 x s32> -> <4 x s32>) 2586 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2587 B.buildExtract(ValReg, WideLoad, 0); 2588 } else { 2589 // For cases where the widened type isn't a nice register value, unmerge 2590 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2591 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2592 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2593 } 2594 } 2595 2596 MI.eraseFromParent(); 2597 return true; 2598 } 2599 2600 return false; 2601 } 2602 2603 bool AMDGPULegalizerInfo::legalizeFMad( 2604 MachineInstr &MI, MachineRegisterInfo &MRI, 2605 MachineIRBuilder &B) const { 2606 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2607 assert(Ty.isScalar()); 2608 2609 MachineFunction &MF = B.getMF(); 2610 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2611 2612 // TODO: Always legal with future ftz flag. 2613 // FIXME: Do we need just output? 2614 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2615 return true; 2616 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2617 return true; 2618 2619 MachineIRBuilder HelperBuilder(MI); 2620 GISelObserverWrapper DummyObserver; 2621 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2622 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2623 } 2624 2625 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2626 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2627 Register DstReg = MI.getOperand(0).getReg(); 2628 Register PtrReg = MI.getOperand(1).getReg(); 2629 Register CmpVal = MI.getOperand(2).getReg(); 2630 Register NewVal = MI.getOperand(3).getReg(); 2631 2632 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2633 "this should not have been custom lowered"); 2634 2635 LLT ValTy = MRI.getType(CmpVal); 2636 LLT VecTy = LLT::fixed_vector(2, ValTy); 2637 2638 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2639 2640 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2641 .addDef(DstReg) 2642 .addUse(PtrReg) 2643 .addUse(PackedVal) 2644 .setMemRefs(MI.memoperands()); 2645 2646 MI.eraseFromParent(); 2647 return true; 2648 } 2649 2650 bool AMDGPULegalizerInfo::legalizeFlog( 2651 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2652 Register Dst = MI.getOperand(0).getReg(); 2653 Register Src = MI.getOperand(1).getReg(); 2654 LLT Ty = B.getMRI()->getType(Dst); 2655 unsigned Flags = MI.getFlags(); 2656 2657 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2658 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2659 2660 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2661 
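  // Added note: the G_FMUL above implements log_b(x) = log2(x) * log_b(2);
  // the callers pass Log2BaseInverted = ln(2) for G_FLOG and ln(2)/ln(10)
  // for G_FLOG10.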
MI.eraseFromParent(); 2662 return true; 2663 } 2664 2665 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2666 MachineIRBuilder &B) const { 2667 Register Dst = MI.getOperand(0).getReg(); 2668 Register Src = MI.getOperand(1).getReg(); 2669 unsigned Flags = MI.getFlags(); 2670 LLT Ty = B.getMRI()->getType(Dst); 2671 2672 auto K = B.buildFConstant(Ty, numbers::log2e); 2673 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2674 B.buildFExp2(Dst, Mul, Flags); 2675 MI.eraseFromParent(); 2676 return true; 2677 } 2678 2679 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2680 MachineIRBuilder &B) const { 2681 Register Dst = MI.getOperand(0).getReg(); 2682 Register Src0 = MI.getOperand(1).getReg(); 2683 Register Src1 = MI.getOperand(2).getReg(); 2684 unsigned Flags = MI.getFlags(); 2685 LLT Ty = B.getMRI()->getType(Dst); 2686 const LLT S16 = LLT::scalar(16); 2687 const LLT S32 = LLT::scalar(32); 2688 2689 if (Ty == S32) { 2690 auto Log = B.buildFLog2(S32, Src0, Flags); 2691 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2692 .addUse(Log.getReg(0)) 2693 .addUse(Src1) 2694 .setMIFlags(Flags); 2695 B.buildFExp2(Dst, Mul, Flags); 2696 } else if (Ty == S16) { 2697 // There's no f16 fmul_legacy, so we need to convert for it. 2698 auto Log = B.buildFLog2(S16, Src0, Flags); 2699 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2700 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2701 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2702 .addUse(Ext0.getReg(0)) 2703 .addUse(Ext1.getReg(0)) 2704 .setMIFlags(Flags); 2705 2706 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2707 } else 2708 return false; 2709 2710 MI.eraseFromParent(); 2711 return true; 2712 } 2713 2714 // Find a source register, ignoring any possible source modifiers. 2715 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2716 Register ModSrc = OrigSrc; 2717 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2718 ModSrc = SrcFNeg->getOperand(1).getReg(); 2719 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2720 ModSrc = SrcFAbs->getOperand(1).getReg(); 2721 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2722 ModSrc = SrcFAbs->getOperand(1).getReg(); 2723 return ModSrc; 2724 } 2725 2726 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2727 MachineRegisterInfo &MRI, 2728 MachineIRBuilder &B) const { 2729 2730 const LLT S1 = LLT::scalar(1); 2731 const LLT S64 = LLT::scalar(64); 2732 Register Dst = MI.getOperand(0).getReg(); 2733 Register OrigSrc = MI.getOperand(1).getReg(); 2734 unsigned Flags = MI.getFlags(); 2735 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2736 "this should not have been custom lowered"); 2737 2738 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2739 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2740 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2741 // V_FRACT bug is: 2742 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2743 // 2744 // Convert floor(x) to (x - fract(x)) 2745 2746 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2747 .addUse(OrigSrc) 2748 .setMIFlags(Flags); 2749 2750 // Give source modifier matching some assistance before obscuring a foldable 2751 // pattern. 2752 2753 // TODO: We can avoid the neg on the fract? The input sign to fract 2754 // shouldn't matter? 
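  // Added note: the nan check further down is done on ModSrc, the source with
  // any fneg/fabs stripped, so those modifiers can still be folded into the
  // compare as VOP source modifiers; isnan(x), isnan(-x) and isnan(|x|) are
  // all equivalent.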
2755 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2756 2757 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2758 2759 Register Min = MRI.createGenericVirtualRegister(S64); 2760 2761 // We don't need to concern ourselves with the snan handling difference, so 2762 // use the one which will directly select. 2763 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2764 if (MFI->getMode().IEEE) 2765 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2766 else 2767 B.buildFMinNum(Min, Fract, Const, Flags); 2768 2769 Register CorrectedFract = Min; 2770 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2771 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2772 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2773 } 2774 2775 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2776 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2777 2778 MI.eraseFromParent(); 2779 return true; 2780 } 2781 2782 // Turn an illegal packed v2s16 build vector into bit operations. 2783 // TODO: This should probably be a bitcast action in LegalizerHelper. 2784 bool AMDGPULegalizerInfo::legalizeBuildVector( 2785 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2786 Register Dst = MI.getOperand(0).getReg(); 2787 const LLT S32 = LLT::scalar(32); 2788 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2789 2790 Register Src0 = MI.getOperand(1).getReg(); 2791 Register Src1 = MI.getOperand(2).getReg(); 2792 assert(MRI.getType(Src0) == LLT::scalar(16)); 2793 2794 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2795 B.buildBitcast(Dst, Merge); 2796 2797 MI.eraseFromParent(); 2798 return true; 2799 } 2800 2801 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2802 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2803 // case with a single min instruction instead of a compare+select. 2804 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2805 MachineRegisterInfo &MRI, 2806 MachineIRBuilder &B) const { 2807 Register Dst = MI.getOperand(0).getReg(); 2808 Register Src = MI.getOperand(1).getReg(); 2809 LLT DstTy = MRI.getType(Dst); 2810 LLT SrcTy = MRI.getType(Src); 2811 2812 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2813 ? AMDGPU::G_AMDGPU_FFBH_U32 2814 : AMDGPU::G_AMDGPU_FFBL_B32; 2815 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2816 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2817 2818 MI.eraseFromParent(); 2819 return true; 2820 } 2821 2822 // Check that this is a G_XOR x, -1 2823 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2824 if (MI.getOpcode() != TargetOpcode::G_XOR) 2825 return false; 2826 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2827 return ConstVal && *ConstVal == -1; 2828 } 2829 2830 // Return the use branch instruction, otherwise null if the usage is invalid. 
2831 static MachineInstr * 2832 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2833 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2834 Register CondDef = MI.getOperand(0).getReg(); 2835 if (!MRI.hasOneNonDBGUse(CondDef)) 2836 return nullptr; 2837 2838 MachineBasicBlock *Parent = MI.getParent(); 2839 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2840 2841 if (isNot(MRI, *UseMI)) { 2842 Register NegatedCond = UseMI->getOperand(0).getReg(); 2843 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2844 return nullptr; 2845 2846 // We're deleting the def of this value, so we need to remove it. 2847 eraseInstr(*UseMI, MRI); 2848 2849 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2850 Negated = true; 2851 } 2852 2853 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2854 return nullptr; 2855 2856 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2857 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2858 if (Next == Parent->end()) { 2859 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2860 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2861 return nullptr; 2862 UncondBrTarget = &*NextMBB; 2863 } else { 2864 if (Next->getOpcode() != AMDGPU::G_BR) 2865 return nullptr; 2866 Br = &*Next; 2867 UncondBrTarget = Br->getOperand(0).getMBB(); 2868 } 2869 2870 return UseMI; 2871 } 2872 2873 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2874 const ArgDescriptor *Arg, 2875 const TargetRegisterClass *ArgRC, 2876 LLT ArgTy) const { 2877 MCRegister SrcReg = Arg->getRegister(); 2878 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2879 assert(DstReg.isVirtual() && "Virtual register expected"); 2880 2881 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 2882 *ArgRC, B.getDebugLoc(), ArgTy); 2883 if (Arg->isMasked()) { 2884 // TODO: Should we try to emit this once in the entry block? 2885 const LLT S32 = LLT::scalar(32); 2886 const unsigned Mask = Arg->getMask(); 2887 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2888 2889 Register AndMaskSrc = LiveIn; 2890 2891 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 2892 // 0. 2893 if (Shift != 0) { 2894 auto ShiftAmt = B.buildConstant(S32, Shift); 2895 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2896 } 2897 2898 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2899 } else { 2900 B.buildCopy(DstReg, LiveIn); 2901 } 2902 2903 return true; 2904 } 2905 2906 bool AMDGPULegalizerInfo::loadInputValue( 2907 Register DstReg, MachineIRBuilder &B, 2908 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2909 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2910 const ArgDescriptor *Arg; 2911 const TargetRegisterClass *ArgRC; 2912 LLT ArgTy; 2913 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2914 2915 if (!Arg) { 2916 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2917 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2918 // case the pointer argument may be missing and we use null. 2919 B.buildConstant(DstReg, 0); 2920 return true; 2921 } 2922 2923 // It's undefined behavior if a function marked with the amdgpu-no-* 2924 // attributes uses the corresponding intrinsic. 
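    // Added example: llvm.amdgcn.workitem.id.x called in a function carrying
    // the "amdgpu-no-workitem-id-x" attribute has no preloaded value to read,
    // so returning undef here is acceptable.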
2925 B.buildUndef(DstReg); 2926 return true; 2927 } 2928 2929 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2930 return false; // TODO: Handle these 2931 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2932 } 2933 2934 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2935 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2936 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2937 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2938 return false; 2939 2940 MI.eraseFromParent(); 2941 return true; 2942 } 2943 2944 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2945 MachineRegisterInfo &MRI, 2946 MachineIRBuilder &B) const { 2947 Register Dst = MI.getOperand(0).getReg(); 2948 LLT DstTy = MRI.getType(Dst); 2949 LLT S16 = LLT::scalar(16); 2950 LLT S32 = LLT::scalar(32); 2951 LLT S64 = LLT::scalar(64); 2952 2953 if (DstTy == S16) 2954 return legalizeFDIV16(MI, MRI, B); 2955 if (DstTy == S32) 2956 return legalizeFDIV32(MI, MRI, B); 2957 if (DstTy == S64) 2958 return legalizeFDIV64(MI, MRI, B); 2959 2960 return false; 2961 } 2962 2963 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2964 Register DstDivReg, 2965 Register DstRemReg, 2966 Register X, 2967 Register Y) const { 2968 const LLT S1 = LLT::scalar(1); 2969 const LLT S32 = LLT::scalar(32); 2970 2971 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2972 // algorithm used here. 2973 2974 // Initial estimate of inv(y). 2975 auto FloatY = B.buildUITOFP(S32, Y); 2976 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2977 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2978 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2979 auto Z = B.buildFPTOUI(S32, ScaledY); 2980 2981 // One round of UNR. 2982 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2983 auto NegYZ = B.buildMul(S32, NegY, Z); 2984 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2985 2986 // Quotient/remainder estimate. 2987 auto Q = B.buildUMulH(S32, X, Z); 2988 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2989 2990 // First quotient/remainder refinement. 2991 auto One = B.buildConstant(S32, 1); 2992 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2993 if (DstDivReg) 2994 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2995 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2996 2997 // Second quotient/remainder refinement. 
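  // Added note: after the single refinement round above, the quotient
  // estimate Q is believed to be at most 2 below the exact quotient, which is
  // why two conditional +1 / -Y correction steps are enough.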
2998 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2999 if (DstDivReg) 3000 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 3001 3002 if (DstRemReg) 3003 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3004 } 3005 3006 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3007 // 3008 // Return lo, hi of result 3009 // 3010 // %cvt.lo = G_UITOFP Val.lo 3011 // %cvt.hi = G_UITOFP Val.hi 3012 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3013 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3014 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3015 // %mul2 = G_FMUL %mul1, 2**(-32) 3016 // %trunc = G_INTRINSIC_TRUNC %mul2 3017 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3018 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3019 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3020 Register Val) { 3021 const LLT S32 = LLT::scalar(32); 3022 auto Unmerge = B.buildUnmerge(S32, Val); 3023 3024 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3025 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3026 3027 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3028 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3029 3030 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3031 auto Mul1 = 3032 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3033 3034 // 2**(-32) 3035 auto Mul2 = 3036 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3037 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3038 3039 // -(2**32) 3040 auto Mad2 = B.buildFMAD(S32, Trunc, 3041 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3042 3043 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3044 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3045 3046 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3047 } 3048 3049 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3050 Register DstDivReg, 3051 Register DstRemReg, 3052 Register Numer, 3053 Register Denom) const { 3054 const LLT S32 = LLT::scalar(32); 3055 const LLT S64 = LLT::scalar(64); 3056 const LLT S1 = LLT::scalar(1); 3057 Register RcpLo, RcpHi; 3058 3059 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3060 3061 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3062 3063 auto Zero64 = B.buildConstant(S64, 0); 3064 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3065 3066 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3067 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3068 3069 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3070 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3071 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3072 3073 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3074 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3075 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3076 3077 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3078 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3079 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3080 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3081 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3082 3083 auto Zero32 = B.buildConstant(S32, 0); 3084 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3085 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3086 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3087 3088 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3089 Register NumerLo = UnmergeNumer.getReg(0); 3090 Register NumerHi = UnmergeNumer.getReg(1); 3091 3092 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3093 auto Mul3 = B.buildMul(S64, Denom, 
MulHi3); 3094 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3095 Register Mul3_Lo = UnmergeMul3.getReg(0); 3096 Register Mul3_Hi = UnmergeMul3.getReg(1); 3097 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3098 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3099 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3100 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3101 3102 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3103 Register DenomLo = UnmergeDenom.getReg(0); 3104 Register DenomHi = UnmergeDenom.getReg(1); 3105 3106 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3107 auto C1 = B.buildSExt(S32, CmpHi); 3108 3109 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3110 auto C2 = B.buildSExt(S32, CmpLo); 3111 3112 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3113 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3114 3115 // TODO: Here and below portions of the code can be enclosed into if/endif. 3116 // Currently control flow is unconditional and we have 4 selects after 3117 // potential endif to substitute PHIs. 3118 3119 // if C3 != 0 ... 3120 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3121 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3122 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3123 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3124 3125 auto One64 = B.buildConstant(S64, 1); 3126 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3127 3128 auto C4 = 3129 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3130 auto C5 = 3131 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3132 auto C6 = B.buildSelect( 3133 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3134 3135 // if (C6 != 0) 3136 auto Add4 = B.buildAdd(S64, Add3, One64); 3137 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3138 3139 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3140 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3141 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3142 3143 // endif C6 3144 // endif C3 3145 3146 if (DstDivReg) { 3147 auto Sel1 = B.buildSelect( 3148 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3149 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3150 Sel1, MulHi3); 3151 } 3152 3153 if (DstRemReg) { 3154 auto Sel2 = B.buildSelect( 3155 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3156 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3157 Sel2, Sub1); 3158 } 3159 } 3160 3161 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3162 MachineRegisterInfo &MRI, 3163 MachineIRBuilder &B) const { 3164 Register DstDivReg, DstRemReg; 3165 switch (MI.getOpcode()) { 3166 default: 3167 llvm_unreachable("Unexpected opcode!"); 3168 case AMDGPU::G_UDIV: { 3169 DstDivReg = MI.getOperand(0).getReg(); 3170 break; 3171 } 3172 case AMDGPU::G_UREM: { 3173 DstRemReg = MI.getOperand(0).getReg(); 3174 break; 3175 } 3176 case AMDGPU::G_UDIVREM: { 3177 DstDivReg = MI.getOperand(0).getReg(); 3178 DstRemReg = MI.getOperand(1).getReg(); 3179 break; 3180 } 3181 } 3182 3183 const LLT S64 = LLT::scalar(64); 3184 const LLT S32 = LLT::scalar(32); 3185 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3186 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3187 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3188 LLT Ty 
= MRI.getType(MI.getOperand(0).getReg()); 3189 3190 if (Ty == S32) 3191 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3192 else if (Ty == S64) 3193 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3194 else 3195 return false; 3196 3197 MI.eraseFromParent(); 3198 return true; 3199 } 3200 3201 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3202 MachineRegisterInfo &MRI, 3203 MachineIRBuilder &B) const { 3204 const LLT S64 = LLT::scalar(64); 3205 const LLT S32 = LLT::scalar(32); 3206 3207 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3208 if (Ty != S32 && Ty != S64) 3209 return false; 3210 3211 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3212 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3213 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3214 3215 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3216 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3217 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3218 3219 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3220 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3221 3222 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3223 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3224 3225 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3226 switch (MI.getOpcode()) { 3227 default: 3228 llvm_unreachable("Unexpected opcode!"); 3229 case AMDGPU::G_SDIV: { 3230 DstDivReg = MI.getOperand(0).getReg(); 3231 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3232 break; 3233 } 3234 case AMDGPU::G_SREM: { 3235 DstRemReg = MI.getOperand(0).getReg(); 3236 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3237 break; 3238 } 3239 case AMDGPU::G_SDIVREM: { 3240 DstDivReg = MI.getOperand(0).getReg(); 3241 DstRemReg = MI.getOperand(1).getReg(); 3242 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3243 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3244 break; 3245 } 3246 } 3247 3248 if (Ty == S32) 3249 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3250 else 3251 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3252 3253 if (DstDivReg) { 3254 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3255 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3256 B.buildSub(DstDivReg, SignXor, Sign); 3257 } 3258 3259 if (DstRemReg) { 3260 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3261 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3262 B.buildSub(DstRemReg, SignXor, Sign); 3263 } 3264 3265 MI.eraseFromParent(); 3266 return true; 3267 } 3268 3269 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3270 MachineRegisterInfo &MRI, 3271 MachineIRBuilder &B) const { 3272 Register Res = MI.getOperand(0).getReg(); 3273 Register LHS = MI.getOperand(1).getReg(); 3274 Register RHS = MI.getOperand(2).getReg(); 3275 uint16_t Flags = MI.getFlags(); 3276 LLT ResTy = MRI.getType(Res); 3277 3278 const MachineFunction &MF = B.getMF(); 3279 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3280 MI.getFlag(MachineInstr::FmAfn); 3281 3282 if (!AllowInaccurateRcp) 3283 return false; 3284 3285 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3286 // 1 / x -> RCP(x) 3287 if (CLHS->isExactlyValue(1.0)) { 3288 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3289 .addUse(RHS) 3290 .setMIFlags(Flags); 3291 3292 MI.eraseFromParent(); 3293 return true; 3294 } 3295 3296 // -1 / x -> RCP( FNEG(x) ) 3297 if (CLHS->isExactlyValue(-1.0)) { 3298 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 
3299 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3300 .addUse(FNeg.getReg(0)) 3301 .setMIFlags(Flags); 3302 3303 MI.eraseFromParent(); 3304 return true; 3305 } 3306 } 3307 3308 // x / y -> x * (1.0 / y) 3309 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3310 .addUse(RHS) 3311 .setMIFlags(Flags); 3312 B.buildFMul(Res, LHS, RCP, Flags); 3313 3314 MI.eraseFromParent(); 3315 return true; 3316 } 3317 3318 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3319 MachineRegisterInfo &MRI, 3320 MachineIRBuilder &B) const { 3321 Register Res = MI.getOperand(0).getReg(); 3322 Register X = MI.getOperand(1).getReg(); 3323 Register Y = MI.getOperand(2).getReg(); 3324 uint16_t Flags = MI.getFlags(); 3325 LLT ResTy = MRI.getType(Res); 3326 3327 const MachineFunction &MF = B.getMF(); 3328 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3329 MI.getFlag(MachineInstr::FmAfn); 3330 3331 if (!AllowInaccurateRcp) 3332 return false; 3333 3334 auto NegY = B.buildFNeg(ResTy, Y); 3335 auto One = B.buildFConstant(ResTy, 1.0); 3336 3337 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3338 .addUse(Y) 3339 .setMIFlags(Flags); 3340 3341 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3342 R = B.buildFMA(ResTy, Tmp0, R, R); 3343 3344 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3345 R = B.buildFMA(ResTy, Tmp1, R, R); 3346 3347 auto Ret = B.buildFMul(ResTy, X, R); 3348 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3349 3350 B.buildFMA(Res, Tmp2, R, Ret); 3351 MI.eraseFromParent(); 3352 return true; 3353 } 3354 3355 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3356 MachineRegisterInfo &MRI, 3357 MachineIRBuilder &B) const { 3358 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3359 return true; 3360 3361 Register Res = MI.getOperand(0).getReg(); 3362 Register LHS = MI.getOperand(1).getReg(); 3363 Register RHS = MI.getOperand(2).getReg(); 3364 3365 uint16_t Flags = MI.getFlags(); 3366 3367 LLT S16 = LLT::scalar(16); 3368 LLT S32 = LLT::scalar(32); 3369 3370 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3371 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3372 3373 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3374 .addUse(RHSExt.getReg(0)) 3375 .setMIFlags(Flags); 3376 3377 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3378 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3379 3380 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3381 .addUse(RDst.getReg(0)) 3382 .addUse(RHS) 3383 .addUse(LHS) 3384 .setMIFlags(Flags); 3385 3386 MI.eraseFromParent(); 3387 return true; 3388 } 3389 3390 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3391 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3392 static void toggleSPDenormMode(bool Enable, 3393 MachineIRBuilder &B, 3394 const GCNSubtarget &ST, 3395 AMDGPU::SIModeRegisterDefaults Mode) { 3396 // Set SP denorm mode to this value. 3397 unsigned SPDenormMode = 3398 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3399 3400 if (ST.hasDenormModeInst()) { 3401 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3402 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3403 3404 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3405 B.buildInstr(AMDGPU::S_DENORM_MODE) 3406 .addImm(NewDenormModeValue); 3407 3408 } else { 3409 // Select FP32 bit field in mode register. 
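    // The immediate built below encodes hwreg(ID_MODE, 4, 2): offset 4 with a
    // field width of 2 (WIDTH_M1 == 1), i.e. MODE[5:4], the single-precision
    // denorm bits, leaving the FP64/FP16 denorm bits in MODE[7:6] untouched.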
3410 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3411 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3412 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3413 3414 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3415 .addImm(SPDenormMode) 3416 .addImm(SPDenormModeBitField); 3417 } 3418 } 3419 3420 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3421 MachineRegisterInfo &MRI, 3422 MachineIRBuilder &B) const { 3423 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3424 return true; 3425 3426 Register Res = MI.getOperand(0).getReg(); 3427 Register LHS = MI.getOperand(1).getReg(); 3428 Register RHS = MI.getOperand(2).getReg(); 3429 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3430 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3431 3432 uint16_t Flags = MI.getFlags(); 3433 3434 LLT S32 = LLT::scalar(32); 3435 LLT S1 = LLT::scalar(1); 3436 3437 auto One = B.buildFConstant(S32, 1.0f); 3438 3439 auto DenominatorScaled = 3440 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3441 .addUse(LHS) 3442 .addUse(RHS) 3443 .addImm(0) 3444 .setMIFlags(Flags); 3445 auto NumeratorScaled = 3446 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3447 .addUse(LHS) 3448 .addUse(RHS) 3449 .addImm(1) 3450 .setMIFlags(Flags); 3451 3452 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3453 .addUse(DenominatorScaled.getReg(0)) 3454 .setMIFlags(Flags); 3455 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3456 3457 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3458 // aren't modeled as reading it. 3459 if (!Mode.allFP32Denormals()) 3460 toggleSPDenormMode(true, B, ST, Mode); 3461 3462 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3463 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3464 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3465 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3466 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3467 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3468 3469 if (!Mode.allFP32Denormals()) 3470 toggleSPDenormMode(false, B, ST, Mode); 3471 3472 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3473 .addUse(Fma4.getReg(0)) 3474 .addUse(Fma1.getReg(0)) 3475 .addUse(Fma3.getReg(0)) 3476 .addUse(NumeratorScaled.getReg(1)) 3477 .setMIFlags(Flags); 3478 3479 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3480 .addUse(Fmas.getReg(0)) 3481 .addUse(RHS) 3482 .addUse(LHS) 3483 .setMIFlags(Flags); 3484 3485 MI.eraseFromParent(); 3486 return true; 3487 } 3488 3489 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3490 MachineRegisterInfo &MRI, 3491 MachineIRBuilder &B) const { 3492 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3493 return true; 3494 3495 Register Res = MI.getOperand(0).getReg(); 3496 Register LHS = MI.getOperand(1).getReg(); 3497 Register RHS = MI.getOperand(2).getReg(); 3498 3499 uint16_t Flags = MI.getFlags(); 3500 3501 LLT S64 = LLT::scalar(64); 3502 LLT S1 = LLT::scalar(1); 3503 3504 auto One = B.buildFConstant(S64, 1.0); 3505 3506 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3507 .addUse(LHS) 3508 .addUse(RHS) 3509 .addImm(0) 3510 .setMIFlags(Flags); 3511 3512 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3513 3514 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3515 .addUse(DivScale0.getReg(0)) 3516 .setMIFlags(Flags); 3517 3518 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3519 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3520 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3521 3522 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3523 .addUse(LHS) 3524 .addUse(RHS) 3525 .addImm(1) 3526 .setMIFlags(Flags); 3527 3528 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3529 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3530 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3531 3532 Register Scale; 3533 if (!ST.hasUsableDivScaleConditionOutput()) { 3534 // Workaround a hardware bug on SI where the condition output from div_scale 3535 // is not usable. 3536 3537 LLT S32 = LLT::scalar(32); 3538 3539 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3540 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3541 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3542 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3543 3544 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3545 Scale1Unmerge.getReg(1)); 3546 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3547 Scale0Unmerge.getReg(1)); 3548 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3549 } else { 3550 Scale = DivScale1.getReg(1); 3551 } 3552 3553 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3554 .addUse(Fma4.getReg(0)) 3555 .addUse(Fma3.getReg(0)) 3556 .addUse(Mul.getReg(0)) 3557 .addUse(Scale) 3558 .setMIFlags(Flags); 3559 3560 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3561 .addUse(Fmas.getReg(0)) 3562 .addUse(RHS) 3563 .addUse(LHS) 3564 .setMIFlags(Flags); 3565 3566 MI.eraseFromParent(); 3567 return true; 3568 } 3569 3570 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3571 MachineRegisterInfo &MRI, 3572 MachineIRBuilder &B) const { 3573 Register Res = MI.getOperand(0).getReg(); 3574 Register LHS = MI.getOperand(2).getReg(); 3575 Register RHS = MI.getOperand(3).getReg(); 3576 uint16_t Flags = MI.getFlags(); 3577 3578 LLT S32 = LLT::scalar(32); 3579 LLT S1 = LLT::scalar(1); 3580 3581 auto Abs = B.buildFAbs(S32, RHS, Flags); 3582 const APFloat C0Val(1.0f); 3583 3584 auto C0 = B.buildConstant(S32, 0x6f800000); 3585 auto C1 = B.buildConstant(S32, 0x2f800000); 3586 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3587 3588 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3589 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3590 3591 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3592 3593 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3594 .addUse(Mul0.getReg(0)) 3595 .setMIFlags(Flags); 3596 3597 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3598 3599 B.buildFMul(Res, Sel, Mul1, Flags); 3600 3601 MI.eraseFromParent(); 3602 return true; 3603 } 3604 3605 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3606 // FIXME: Why do we handle this one but not other removed instructions? 3607 // 3608 // Reciprocal square root. The clamp prevents infinite results, clamping 3609 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3610 // +-max_float. 
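//
// Expansion sketch, matching the code below:
//   %rsq = llvm.amdgcn.rsq(%src)
//   %res = fmax(fmin(%rsq, +MAX_FLT), -MAX_FLT)
// using the IEEE min/max variants when the function runs in IEEE mode.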
3611 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3612 MachineRegisterInfo &MRI, 3613 MachineIRBuilder &B) const { 3614 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3615 return true; 3616 3617 Register Dst = MI.getOperand(0).getReg(); 3618 Register Src = MI.getOperand(2).getReg(); 3619 auto Flags = MI.getFlags(); 3620 3621 LLT Ty = MRI.getType(Dst); 3622 3623 const fltSemantics *FltSemantics; 3624 if (Ty == LLT::scalar(32)) 3625 FltSemantics = &APFloat::IEEEsingle(); 3626 else if (Ty == LLT::scalar(64)) 3627 FltSemantics = &APFloat::IEEEdouble(); 3628 else 3629 return false; 3630 3631 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3632 .addUse(Src) 3633 .setMIFlags(Flags); 3634 3635 // We don't need to concern ourselves with the snan handling difference, since 3636 // the rsq quieted (or not) so use the one which will directly select. 3637 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3638 const bool UseIEEE = MFI->getMode().IEEE; 3639 3640 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3641 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3642 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3643 3644 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3645 3646 if (UseIEEE) 3647 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3648 else 3649 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3650 MI.eraseFromParent(); 3651 return true; 3652 } 3653 3654 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3655 switch (IID) { 3656 case Intrinsic::amdgcn_ds_fadd: 3657 return AMDGPU::G_ATOMICRMW_FADD; 3658 case Intrinsic::amdgcn_ds_fmin: 3659 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3660 case Intrinsic::amdgcn_ds_fmax: 3661 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3662 default: 3663 llvm_unreachable("not a DS FP intrinsic"); 3664 } 3665 } 3666 3667 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3668 MachineInstr &MI, 3669 Intrinsic::ID IID) const { 3670 GISelChangeObserver &Observer = Helper.Observer; 3671 Observer.changingInstr(MI); 3672 3673 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3674 3675 // The remaining operands were used to set fields in the MemOperand on 3676 // construction. 3677 for (int I = 6; I > 3; --I) 3678 MI.RemoveOperand(I); 3679 3680 MI.RemoveOperand(1); // Remove the intrinsic ID. 
3681 Observer.changedInstr(MI); 3682 return true; 3683 } 3684 3685 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3686 MachineRegisterInfo &MRI, 3687 MachineIRBuilder &B) const { 3688 uint64_t Offset = 3689 ST.getTargetLowering()->getImplicitParameterOffset( 3690 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3691 LLT DstTy = MRI.getType(DstReg); 3692 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3693 3694 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3695 if (!loadInputValue(KernargPtrReg, B, 3696 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3697 return false; 3698 3699 // FIXME: This should be nuw 3700 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3701 return true; 3702 } 3703 3704 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3705 MachineRegisterInfo &MRI, 3706 MachineIRBuilder &B) const { 3707 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3708 if (!MFI->isEntryFunction()) { 3709 return legalizePreloadedArgIntrin(MI, MRI, B, 3710 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3711 } 3712 3713 Register DstReg = MI.getOperand(0).getReg(); 3714 if (!getImplicitArgPtr(DstReg, MRI, B)) 3715 return false; 3716 3717 MI.eraseFromParent(); 3718 return true; 3719 } 3720 3721 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3722 MachineRegisterInfo &MRI, 3723 MachineIRBuilder &B, 3724 unsigned AddrSpace) const { 3725 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3726 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3727 Register Hi32 = Unmerge.getReg(1); 3728 3729 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3730 MI.eraseFromParent(); 3731 return true; 3732 } 3733 3734 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3735 // offset (the offset that is included in bounds checking and swizzling, to be 3736 // split between the instruction's voffset and immoffset fields) and soffset 3737 // (the offset that is excluded from bounds checking and swizzling, to go in 3738 // the instruction's soffset field). This function takes the first kind of 3739 // offset and figures out how to split it between voffset and immoffset. 3740 std::pair<Register, unsigned> 3741 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3742 Register OrigOffset) const { 3743 const unsigned MaxImm = 4095; 3744 Register BaseReg; 3745 unsigned ImmOffset; 3746 const LLT S32 = LLT::scalar(32); 3747 MachineRegisterInfo &MRI = *B.getMRI(); 3748 3749 std::tie(BaseReg, ImmOffset) = 3750 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3751 3752 // If BaseReg is a pointer, convert it to int. 3753 if (MRI.getType(BaseReg).isPointer()) 3754 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3755 3756 // If the immediate value is too big for the immoffset field, put the value 3757 // and -4096 into the immoffset field so that the value that is copied/added 3758 // for the voffset field is a multiple of 4096, and it stands more chance 3759 // of being CSEd with the copy/add for another similar load/store. 3760 // However, do not do that rounding down to a multiple of 4096 if that is a 3761 // negative number, as it appears to be illegal to have a negative offset 3762 // in the vgpr, even if adding the immediate offset makes it positive. 
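  // Worked example: for a constant offset of 4200, the code below computes
  // Overflow = 4200 & ~4095 = 4096 and ImmOffset = 104, so 104 goes in the
  // immoffset field and the multiple-of-4096 part is added to the voffset
  // register. If Overflow would be negative as an int32, the entire offset is
  // moved into the voffset register and ImmOffset becomes 0.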
3763 unsigned Overflow = ImmOffset & ~MaxImm; 3764 ImmOffset -= Overflow; 3765 if ((int32_t)Overflow < 0) { 3766 Overflow += ImmOffset; 3767 ImmOffset = 0; 3768 } 3769 3770 if (Overflow != 0) { 3771 if (!BaseReg) { 3772 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3773 } else { 3774 auto OverflowVal = B.buildConstant(S32, Overflow); 3775 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3776 } 3777 } 3778 3779 if (!BaseReg) 3780 BaseReg = B.buildConstant(S32, 0).getReg(0); 3781 3782 return std::make_pair(BaseReg, ImmOffset); 3783 } 3784 3785 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3786 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3787 Register VOffset, Register SOffset, 3788 unsigned ImmOffset, Register VIndex, 3789 MachineRegisterInfo &MRI) const { 3790 Optional<ValueAndVReg> MaybeVOffsetVal = 3791 getIConstantVRegValWithLookThrough(VOffset, MRI); 3792 Optional<ValueAndVReg> MaybeSOffsetVal = 3793 getIConstantVRegValWithLookThrough(SOffset, MRI); 3794 Optional<ValueAndVReg> MaybeVIndexVal = 3795 getIConstantVRegValWithLookThrough(VIndex, MRI); 3796 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3797 // update the MMO with that offset. The stride is unknown so we can only do 3798 // this if VIndex is constant 0. 3799 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3800 MaybeVIndexVal->Value == 0) { 3801 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3802 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3803 MMO->setOffset(TotalOffset); 3804 } else { 3805 // We don't have a constant combined offset to use in the MMO. Give up. 3806 MMO->setValue((Value *)nullptr); 3807 } 3808 } 3809 3810 /// Handle register layout difference for f16 images for some subtargets. 
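/// For example, on subtargets with unpacked D16 VMEM a <4 x s16> payload is
/// widened to a <4 x s32> of any-extended elements, while subtargets with the
/// image-store D16 bug pad <3 x s16> data with undef and bitcast it to
/// <3 x s32>.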
3811 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3812 MachineRegisterInfo &MRI, 3813 Register Reg, 3814 bool ImageStore) const { 3815 const LLT S16 = LLT::scalar(16); 3816 const LLT S32 = LLT::scalar(32); 3817 LLT StoreVT = MRI.getType(Reg); 3818 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3819 3820 if (ST.hasUnpackedD16VMem()) { 3821 auto Unmerge = B.buildUnmerge(S16, Reg); 3822 3823 SmallVector<Register, 4> WideRegs; 3824 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3825 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3826 3827 int NumElts = StoreVT.getNumElements(); 3828 3829 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3830 .getReg(0); 3831 } 3832 3833 if (ImageStore && ST.hasImageStoreD16Bug()) { 3834 if (StoreVT.getNumElements() == 2) { 3835 SmallVector<Register, 4> PackedRegs; 3836 Reg = B.buildBitcast(S32, Reg).getReg(0); 3837 PackedRegs.push_back(Reg); 3838 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3839 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3840 .getReg(0); 3841 } 3842 3843 if (StoreVT.getNumElements() == 3) { 3844 SmallVector<Register, 4> PackedRegs; 3845 auto Unmerge = B.buildUnmerge(S16, Reg); 3846 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3847 PackedRegs.push_back(Unmerge.getReg(I)); 3848 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3849 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3850 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3851 } 3852 3853 if (StoreVT.getNumElements() == 4) { 3854 SmallVector<Register, 4> PackedRegs; 3855 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3856 auto Unmerge = B.buildUnmerge(S32, Reg); 3857 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3858 PackedRegs.push_back(Unmerge.getReg(I)); 3859 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3860 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3861 .getReg(0); 3862 } 3863 3864 llvm_unreachable("invalid data type"); 3865 } 3866 3867 if (StoreVT == LLT::fixed_vector(3, S16)) { 3868 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 3869 .getReg(0); 3870 } 3871 return Reg; 3872 } 3873 3874 Register AMDGPULegalizerInfo::fixStoreSourceType( 3875 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3876 MachineRegisterInfo *MRI = B.getMRI(); 3877 LLT Ty = MRI->getType(VData); 3878 3879 const LLT S16 = LLT::scalar(16); 3880 3881 // Fixup illegal register types for i8 stores. 
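  // e.g. an s8 or s16 payload is any-extended to s32 here; the byte/short
  // buffer-store opcode chosen later only writes the low bits of that
  // register.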
3882 if (Ty == LLT::scalar(8) || Ty == S16) { 3883 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3884 return AnyExt; 3885 } 3886 3887 if (Ty.isVector()) { 3888 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3889 if (IsFormat) 3890 return handleD16VData(B, *MRI, VData); 3891 } 3892 } 3893 3894 return VData; 3895 } 3896 3897 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3898 MachineRegisterInfo &MRI, 3899 MachineIRBuilder &B, 3900 bool IsTyped, 3901 bool IsFormat) const { 3902 Register VData = MI.getOperand(1).getReg(); 3903 LLT Ty = MRI.getType(VData); 3904 LLT EltTy = Ty.getScalarType(); 3905 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3906 const LLT S32 = LLT::scalar(32); 3907 3908 VData = fixStoreSourceType(B, VData, IsFormat); 3909 Register RSrc = MI.getOperand(2).getReg(); 3910 3911 MachineMemOperand *MMO = *MI.memoperands_begin(); 3912 const int MemSize = MMO->getSize(); 3913 3914 unsigned ImmOffset; 3915 3916 // The typed intrinsics add an immediate after the registers. 3917 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3918 3919 // The struct intrinsic variants add one additional operand over raw. 3920 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3921 Register VIndex; 3922 int OpOffset = 0; 3923 if (HasVIndex) { 3924 VIndex = MI.getOperand(3).getReg(); 3925 OpOffset = 1; 3926 } else { 3927 VIndex = B.buildConstant(S32, 0).getReg(0); 3928 } 3929 3930 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3931 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3932 3933 unsigned Format = 0; 3934 if (IsTyped) { 3935 Format = MI.getOperand(5 + OpOffset).getImm(); 3936 ++OpOffset; 3937 } 3938 3939 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3940 3941 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3942 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3943 3944 unsigned Opc; 3945 if (IsTyped) { 3946 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3947 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3948 } else if (IsFormat) { 3949 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3950 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3951 } else { 3952 switch (MemSize) { 3953 case 1: 3954 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3955 break; 3956 case 2: 3957 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3958 break; 3959 default: 3960 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3961 break; 3962 } 3963 } 3964 3965 auto MIB = B.buildInstr(Opc) 3966 .addUse(VData) // vdata 3967 .addUse(RSrc) // rsrc 3968 .addUse(VIndex) // vindex 3969 .addUse(VOffset) // voffset 3970 .addUse(SOffset) // soffset 3971 .addImm(ImmOffset); // offset(imm) 3972 3973 if (IsTyped) 3974 MIB.addImm(Format); 3975 3976 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3977 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3978 .addMemOperand(MMO); 3979 3980 MI.eraseFromParent(); 3981 return true; 3982 } 3983 3984 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3985 MachineRegisterInfo &MRI, 3986 MachineIRBuilder &B, 3987 bool IsFormat, 3988 bool IsTyped) const { 3989 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3990 MachineMemOperand *MMO = *MI.memoperands_begin(); 3991 const LLT MemTy = MMO->getMemoryType(); 3992 const LLT S32 = LLT::scalar(32); 3993 3994 Register Dst = MI.getOperand(0).getReg(); 3995 Register RSrc = MI.getOperand(2).getReg(); 3996 3997 // The typed intrinsics add an immediate after the registers. 
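  // Operand-count sketch for the typed case: raw.tbuffer.load is roughly
  // (dst, intrinsic-id, rsrc, voffset, soffset, format, aux) = 7 operands,
  // and the struct form inserts a vindex after rsrc for a total of 8, which
  // is how HasVIndex is detected below.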
3998 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3999 4000 // The struct intrinsic variants add one additional operand over raw. 4001 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4002 Register VIndex; 4003 int OpOffset = 0; 4004 if (HasVIndex) { 4005 VIndex = MI.getOperand(3).getReg(); 4006 OpOffset = 1; 4007 } else { 4008 VIndex = B.buildConstant(S32, 0).getReg(0); 4009 } 4010 4011 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4012 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4013 4014 unsigned Format = 0; 4015 if (IsTyped) { 4016 Format = MI.getOperand(5 + OpOffset).getImm(); 4017 ++OpOffset; 4018 } 4019 4020 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4021 unsigned ImmOffset; 4022 4023 LLT Ty = MRI.getType(Dst); 4024 LLT EltTy = Ty.getScalarType(); 4025 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4026 const bool Unpacked = ST.hasUnpackedD16VMem(); 4027 4028 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4029 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4030 4031 unsigned Opc; 4032 4033 if (IsTyped) { 4034 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4035 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4036 } else if (IsFormat) { 4037 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4038 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4039 } else { 4040 switch (MemTy.getSizeInBits()) { 4041 case 8: 4042 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4043 break; 4044 case 16: 4045 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4046 break; 4047 default: 4048 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4049 break; 4050 } 4051 } 4052 4053 Register LoadDstReg; 4054 4055 bool IsExtLoad = 4056 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4057 LLT UnpackedTy = Ty.changeElementSize(32); 4058 4059 if (IsExtLoad) 4060 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4061 else if (Unpacked && IsD16 && Ty.isVector()) 4062 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4063 else 4064 LoadDstReg = Dst; 4065 4066 auto MIB = B.buildInstr(Opc) 4067 .addDef(LoadDstReg) // vdata 4068 .addUse(RSrc) // rsrc 4069 .addUse(VIndex) // vindex 4070 .addUse(VOffset) // voffset 4071 .addUse(SOffset) // soffset 4072 .addImm(ImmOffset); // offset(imm) 4073 4074 if (IsTyped) 4075 MIB.addImm(Format); 4076 4077 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4078 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4079 .addMemOperand(MMO); 4080 4081 if (LoadDstReg != Dst) { 4082 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4083 4084 // Widen result for extending loads was widened. 4085 if (IsExtLoad) 4086 B.buildTrunc(Dst, LoadDstReg); 4087 else { 4088 // Repack to original 16-bit vector result 4089 // FIXME: G_TRUNC should work, but legalization currently fails 4090 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4091 SmallVector<Register, 4> Repack; 4092 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4093 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4094 B.buildMerge(Dst, Repack); 4095 } 4096 } 4097 4098 MI.eraseFromParent(); 4099 return true; 4100 } 4101 4102 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4103 MachineIRBuilder &B, 4104 bool IsInc) const { 4105 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4106 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4107 B.buildInstr(Opc) 4108 .addDef(MI.getOperand(0).getReg()) 4109 .addUse(MI.getOperand(2).getReg()) 4110 .addUse(MI.getOperand(3).getReg()) 4111 .cloneMemRefs(MI); 4112 MI.eraseFromParent(); 4113 return true; 4114 } 4115 4116 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4117 switch (IntrID) { 4118 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4119 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4120 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4121 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4122 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4123 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4124 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4125 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4126 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4127 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4128 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4129 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4130 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4131 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4132 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4133 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4134 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4135 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4136 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4137 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4138 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4139 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4140 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4141 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4142 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4143 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4144 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4145 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4146 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4147 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4148 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4149 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4150 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4151 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4152 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4153 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4154 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4155 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4156 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4157 case Intrinsic::amdgcn_buffer_atomic_fadd: 4158 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4159 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4160 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4161 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4162 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4163 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4164 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4165 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4166 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4167 default: 4168 llvm_unreachable("unhandled atomic opcode"); 4169 } 4170 } 4171 4172 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4173 MachineIRBuilder &B, 4174 Intrinsic::ID IID) const { 4175 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4176 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4177 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4178 4179 Register Dst; 4180 4181 int OpOffset = 0; 4182 if (HasReturn) { 4183 // A few FP atomics do not support return values. 
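    // When the intrinsic returns a value, operand 0 is the result def;
    // otherwise there is no def and every source operand index below shifts
    // down by one (OpOffset = -1).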
4184 Dst = MI.getOperand(0).getReg(); 4185 } else { 4186 OpOffset = -1; 4187 } 4188 4189 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4190 Register CmpVal; 4191 4192 if (IsCmpSwap) { 4193 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4194 ++OpOffset; 4195 } 4196 4197 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4198 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4199 4200 // The struct intrinsic variants add one additional operand over raw. 4201 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4202 Register VIndex; 4203 if (HasVIndex) { 4204 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4205 ++OpOffset; 4206 } else { 4207 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4208 } 4209 4210 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4211 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4212 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4213 4214 MachineMemOperand *MMO = *MI.memoperands_begin(); 4215 4216 unsigned ImmOffset; 4217 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4218 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4219 4220 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4221 4222 if (HasReturn) 4223 MIB.addDef(Dst); 4224 4225 MIB.addUse(VData); // vdata 4226 4227 if (IsCmpSwap) 4228 MIB.addReg(CmpVal); 4229 4230 MIB.addUse(RSrc) // rsrc 4231 .addUse(VIndex) // vindex 4232 .addUse(VOffset) // voffset 4233 .addUse(SOffset) // soffset 4234 .addImm(ImmOffset) // offset(imm) 4235 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4236 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4237 .addMemOperand(MMO); 4238 4239 MI.eraseFromParent(); 4240 return true; 4241 } 4242 4243 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4244 /// vector with s16 typed elements. 4245 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4246 SmallVectorImpl<Register> &PackedAddrs, 4247 unsigned ArgOffset, 4248 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4249 bool IsA16, bool IsG16) { 4250 const LLT S16 = LLT::scalar(16); 4251 const LLT V2S16 = LLT::fixed_vector(2, 16); 4252 auto EndIdx = Intr->VAddrEnd; 4253 4254 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4255 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4256 if (!SrcOp.isReg()) 4257 continue; // _L to _LZ may have eliminated this. 4258 4259 Register AddrReg = SrcOp.getReg(); 4260 4261 if ((I < Intr->GradientStart) || 4262 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4263 (I >= Intr->CoordStart && !IsA16)) { 4264 // Handle any gradient or coordinate operands that should not be packed 4265 if ((I < Intr->GradientStart) && IsA16 && 4266 (B.getMRI()->getType(AddrReg) == S16)) { 4267 // Special handling of bias when A16 is on. Bias is of type half but 4268 // occupies full 32-bit. 4269 PackedAddrs.push_back( 4270 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4271 .getReg(0)); 4272 } else { 4273 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4274 PackedAddrs.push_back(AddrReg); 4275 } 4276 } else { 4277 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4278 // derivatives dx/dh and dx/dv are packed with undef. 
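      // e.g. for a 1D sample.d with 16-bit derivatives and coordinates this
      // produces roughly {dsdh, undef}, {dsdv, undef}, {s, undef}.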
4279 if (((I + 1) >= EndIdx) || 4280 ((Intr->NumGradients / 2) % 2 == 1 && 4281 (I == static_cast<unsigned>(Intr->GradientStart + 4282 (Intr->NumGradients / 2) - 1) || 4283 I == static_cast<unsigned>(Intr->GradientStart + 4284 Intr->NumGradients - 1))) || 4285 // Check for _L to _LZ optimization 4286 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4287 PackedAddrs.push_back( 4288 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4289 .getReg(0)); 4290 } else { 4291 PackedAddrs.push_back( 4292 B.buildBuildVector( 4293 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4294 .getReg(0)); 4295 ++I; 4296 } 4297 } 4298 } 4299 } 4300 4301 /// Convert from separate vaddr components to a single vector address register, 4302 /// and replace the remaining operands with $noreg. 4303 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4304 int DimIdx, int NumVAddrs) { 4305 const LLT S32 = LLT::scalar(32); 4306 4307 SmallVector<Register, 8> AddrRegs; 4308 for (int I = 0; I != NumVAddrs; ++I) { 4309 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4310 if (SrcOp.isReg()) { 4311 AddrRegs.push_back(SrcOp.getReg()); 4312 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4313 } 4314 } 4315 4316 int NumAddrRegs = AddrRegs.size(); 4317 if (NumAddrRegs != 1) { 4318 // Above 8 elements round up to next power of 2 (i.e. 16). 4319 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { 4320 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4321 auto Undef = B.buildUndef(S32); 4322 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4323 NumAddrRegs = RoundedNumRegs; 4324 } 4325 4326 auto VAddr = 4327 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4328 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4329 } 4330 4331 for (int I = 1; I != NumVAddrs; ++I) { 4332 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4333 if (SrcOp.isReg()) 4334 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4335 } 4336 } 4337 4338 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4339 /// 4340 /// Depending on the subtarget, load/store with 16-bit element data need to be 4341 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4342 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4343 /// registers. 4344 /// 4345 /// We don't want to directly select image instructions just yet, but also want 4346 /// to exposes all register repacking to the legalizer/combiners. We also don't 4347 /// want a selected instrution entering RegBankSelect. In order to avoid 4348 /// defining a multitude of intermediate image instructions, directly hack on 4349 /// the intrinsic's arguments. In cases like a16 addresses, this requires 4350 /// padding now unnecessary arguments with $noreg. 4351 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4352 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4353 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4354 4355 const unsigned NumDefs = MI.getNumExplicitDefs(); 4356 const unsigned ArgOffset = NumDefs + 1; 4357 bool IsTFE = NumDefs == 2; 4358 // We are only processing the operands of d16 image operations on subtargets 4359 // that use the unpacked register layout, or need to repack the TFE result. 4360 4361 // TODO: Do we need to guard against already legalized intrinsics? 
4362 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4363 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4364 4365 MachineRegisterInfo *MRI = B.getMRI(); 4366 const LLT S32 = LLT::scalar(32); 4367 const LLT S16 = LLT::scalar(16); 4368 const LLT V2S16 = LLT::fixed_vector(2, 16); 4369 4370 unsigned DMask = 0; 4371 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 4372 LLT Ty = MRI->getType(VData); 4373 4374 // Check for 16 bit addresses and pack if true. 4375 LLT GradTy = 4376 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4377 LLT AddrTy = 4378 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4379 const bool IsG16 = GradTy == S16; 4380 const bool IsA16 = AddrTy == S16; 4381 const bool IsD16 = Ty.getScalarType() == S16; 4382 4383 int DMaskLanes = 0; 4384 if (!BaseOpcode->Atomic) { 4385 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4386 if (BaseOpcode->Gather4) { 4387 DMaskLanes = 4; 4388 } else if (DMask != 0) { 4389 DMaskLanes = countPopulation(DMask); 4390 } else if (!IsTFE && !BaseOpcode->Store) { 4391 // If dmask is 0, this is a no-op load. This can be eliminated. 4392 B.buildUndef(MI.getOperand(0)); 4393 MI.eraseFromParent(); 4394 return true; 4395 } 4396 } 4397 4398 Observer.changingInstr(MI); 4399 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4400 4401 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 4402 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 4403 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 4404 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4405 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 4406 4407 // Track that we legalized this 4408 MI.setDesc(B.getTII().get(NewOpcode)); 4409 4410 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4411 // dmask to be at least 1 otherwise the instruction will fail 4412 if (IsTFE && DMask == 0) { 4413 DMask = 0x1; 4414 DMaskLanes = 1; 4415 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4416 } 4417 4418 if (BaseOpcode->Atomic) { 4419 Register VData0 = MI.getOperand(2).getReg(); 4420 LLT Ty = MRI->getType(VData0); 4421 4422 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4423 if (Ty.isVector()) 4424 return false; 4425 4426 if (BaseOpcode->AtomicX2) { 4427 Register VData1 = MI.getOperand(3).getReg(); 4428 // The two values are packed in one register. 4429 LLT PackedTy = LLT::fixed_vector(2, Ty); 4430 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4431 MI.getOperand(2).setReg(Concat.getReg(0)); 4432 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4433 } 4434 } 4435 4436 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4437 4438 // Optimize _L to _LZ when _L is zero 4439 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4440 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4441 const ConstantFP *ConstantLod; 4442 4443 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4444 m_GFCst(ConstantLod))) { 4445 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4446 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4447 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4448 AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ, 4449 Intr->Dim); 4450 4451 // The starting indexes should remain in the same place. 
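          // e.g. image.sample.l with lod == 0 becomes image.sample.lz and the
          // lod operand is removed, so the corrected vaddr count drops by one.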
4452 --CorrectedNumVAddrs; 4453 4454 MI.getOperand(NumDefs).setIntrinsicID( 4455 static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4456 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4457 Intr = NewImageDimIntr; 4458 } 4459 } 4460 } 4461 4462 // Optimize _mip away, when 'lod' is zero 4463 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4464 int64_t ConstantLod; 4465 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4466 m_ICst(ConstantLod))) { 4467 if (ConstantLod == 0) { 4468 // TODO: Change intrinsic opcode and remove operand instead or replacing 4469 // it with 0, as the _L to _LZ handling is done above. 4470 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4471 --CorrectedNumVAddrs; 4472 } 4473 } 4474 } 4475 4476 // Rewrite the addressing register layout before doing anything else. 4477 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4478 // 16 bit gradients are supported, but are tied to the A16 control 4479 // so both gradients and addresses must be 16 bit 4480 return false; 4481 } 4482 4483 if (IsA16 && !ST.hasA16()) { 4484 // A16 not supported 4485 return false; 4486 } 4487 4488 if (IsA16 || IsG16) { 4489 if (Intr->NumVAddrs > 1) { 4490 SmallVector<Register, 4> PackedRegs; 4491 4492 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4493 IsG16); 4494 4495 // See also below in the non-a16 branch 4496 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4497 PackedRegs.size() <= ST.getNSAMaxSize(); 4498 4499 if (!UseNSA && PackedRegs.size() > 1) { 4500 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4501 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4502 PackedRegs[0] = Concat.getReg(0); 4503 PackedRegs.resize(1); 4504 } 4505 4506 const unsigned NumPacked = PackedRegs.size(); 4507 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4508 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4509 if (!SrcOp.isReg()) { 4510 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4511 continue; 4512 } 4513 4514 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4515 4516 if (I - Intr->VAddrStart < NumPacked) 4517 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4518 else 4519 SrcOp.setReg(AMDGPU::NoRegister); 4520 } 4521 } 4522 } else { 4523 // If the register allocator cannot place the address registers contiguously 4524 // without introducing moves, then using the non-sequential address encoding 4525 // is always preferable, since it saves VALU instructions and is usually a 4526 // wash in terms of code size or even better. 4527 // 4528 // However, we currently have no way of hinting to the register allocator 4529 // that MIMG addresses should be placed contiguously when it is possible to 4530 // do so, so force non-NSA for the common 2-address case as a heuristic. 4531 // 4532 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4533 // allocation when possible. 4534 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4535 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4536 4537 if (!UseNSA && Intr->NumVAddrs > 1) 4538 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4539 Intr->NumVAddrs); 4540 } 4541 4542 int Flags = 0; 4543 if (IsA16) 4544 Flags |= 1; 4545 if (IsG16) 4546 Flags |= 2; 4547 MI.addOperand(MachineOperand::CreateImm(Flags)); 4548 4549 if (BaseOpcode->Store) { // No TFE for stores? 
4550 // TODO: Handle dmask trim 4551 if (!Ty.isVector() || !IsD16) 4552 return true; 4553 4554 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4555 if (RepackedReg != VData) { 4556 MI.getOperand(1).setReg(RepackedReg); 4557 } 4558 4559 return true; 4560 } 4561 4562 Register DstReg = MI.getOperand(0).getReg(); 4563 const LLT EltTy = Ty.getScalarType(); 4564 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4565 4566 // Confirm that the return type is large enough for the dmask specified 4567 if (NumElts < DMaskLanes) 4568 return false; 4569 4570 if (NumElts > 4 || DMaskLanes > 4) 4571 return false; 4572 4573 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4574 const LLT AdjustedTy = 4575 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4576 4577 // The raw dword aligned data component of the load. The only legal cases 4578 // where this matters should be when using the packed D16 format, for 4579 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4580 LLT RoundedTy; 4581 4582 // S32 vector to to cover all data, plus TFE result element. 4583 LLT TFETy; 4584 4585 // Register type to use for each loaded component. Will be S32 or V2S16. 4586 LLT RegTy; 4587 4588 if (IsD16 && ST.hasUnpackedD16VMem()) { 4589 RoundedTy = 4590 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4591 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4592 RegTy = S32; 4593 } else { 4594 unsigned EltSize = EltTy.getSizeInBits(); 4595 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4596 unsigned RoundedSize = 32 * RoundedElts; 4597 RoundedTy = LLT::scalarOrVector( 4598 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4599 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4600 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4601 } 4602 4603 // The return type does not need adjustment. 4604 // TODO: Should we change s16 case to s32 or <2 x s16>? 4605 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4606 return true; 4607 4608 Register Dst1Reg; 4609 4610 // Insert after the instruction. 4611 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4612 4613 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4614 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4615 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4616 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4617 4618 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4619 4620 MI.getOperand(0).setReg(NewResultReg); 4621 4622 // In the IR, TFE is supposed to be used with a 2 element struct return 4623 // type. The instruction really returns these two values in one contiguous 4624 // register, with one additional dword beyond the loaded data. Rewrite the 4625 // return type to use a single register result. 4626 4627 if (IsTFE) { 4628 Dst1Reg = MI.getOperand(1).getReg(); 4629 if (MRI->getType(Dst1Reg) != S32) 4630 return false; 4631 4632 // TODO: Make sure the TFE operand bit is set. 4633 MI.RemoveOperand(1); 4634 4635 // Handle the easy case that requires no repack instructions. 4636 if (Ty == S32) { 4637 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4638 return true; 4639 } 4640 } 4641 4642 // Now figure out how to copy the new result register back into the old 4643 // result. 4644 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4645 4646 const int NumDataRegs = IsTFE ? 
ResultNumRegs - 1 : ResultNumRegs; 4647 4648 if (ResultNumRegs == 1) { 4649 assert(!IsTFE); 4650 ResultRegs[0] = NewResultReg; 4651 } else { 4652 // We have to repack into a new vector of some kind. 4653 for (int I = 0; I != NumDataRegs; ++I) 4654 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4655 B.buildUnmerge(ResultRegs, NewResultReg); 4656 4657 // Drop the final TFE element to get the data part. The TFE result is 4658 // directly written to the right place already. 4659 if (IsTFE) 4660 ResultRegs.resize(NumDataRegs); 4661 } 4662 4663 // For an s16 scalar result, we form an s32 result with a truncate regardless 4664 // of packed vs. unpacked. 4665 if (IsD16 && !Ty.isVector()) { 4666 B.buildTrunc(DstReg, ResultRegs[0]); 4667 return true; 4668 } 4669 4670 // Avoid a build/concat_vector of 1 entry. 4671 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4672 B.buildBitcast(DstReg, ResultRegs[0]); 4673 return true; 4674 } 4675 4676 assert(Ty.isVector()); 4677 4678 if (IsD16) { 4679 // For packed D16 results with TFE enabled, all the data components are 4680 // S32. Cast back to the expected type. 4681 // 4682 // TODO: We don't really need to use load s32 elements. We would only need one 4683 // cast for the TFE result if a multiple of v2s16 was used. 4684 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4685 for (Register &Reg : ResultRegs) 4686 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4687 } else if (ST.hasUnpackedD16VMem()) { 4688 for (Register &Reg : ResultRegs) 4689 Reg = B.buildTrunc(S16, Reg).getReg(0); 4690 } 4691 } 4692 4693 auto padWithUndef = [&](LLT Ty, int NumElts) { 4694 if (NumElts == 0) 4695 return; 4696 Register Undef = B.buildUndef(Ty).getReg(0); 4697 for (int I = 0; I != NumElts; ++I) 4698 ResultRegs.push_back(Undef); 4699 }; 4700 4701 // Pad out any elements eliminated due to the dmask. 4702 LLT ResTy = MRI->getType(ResultRegs[0]); 4703 if (!ResTy.isVector()) { 4704 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4705 B.buildBuildVector(DstReg, ResultRegs); 4706 return true; 4707 } 4708 4709 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4710 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4711 4712 // Deal with the one annoying legal case. 
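  // i.e. a <3 x s16> result: the dword-aligned load data covers two v2s16
  // registers (<4 x s16>), so the destination is either trimmed of the extra
  // trailing element or padded with undef to match, as below.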
4713 const LLT V3S16 = LLT::fixed_vector(3, 16); 4714 if (Ty == V3S16) { 4715 if (IsTFE) { 4716 if (ResultRegs.size() == 1) { 4717 NewResultReg = ResultRegs[0]; 4718 } else if (ResultRegs.size() == 2) { 4719 LLT V4S16 = LLT::fixed_vector(4, 16); 4720 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 4721 } else { 4722 return false; 4723 } 4724 } 4725 4726 if (MRI->getType(DstReg).getNumElements() < 4727 MRI->getType(NewResultReg).getNumElements()) { 4728 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 4729 } else { 4730 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 4731 } 4732 return true; 4733 } 4734 4735 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4736 B.buildConcatVectors(DstReg, ResultRegs); 4737 return true; 4738 } 4739 4740 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4741 LegalizerHelper &Helper, MachineInstr &MI) const { 4742 MachineIRBuilder &B = Helper.MIRBuilder; 4743 GISelChangeObserver &Observer = Helper.Observer; 4744 4745 Register Dst = MI.getOperand(0).getReg(); 4746 LLT Ty = B.getMRI()->getType(Dst); 4747 unsigned Size = Ty.getSizeInBits(); 4748 MachineFunction &MF = B.getMF(); 4749 4750 Observer.changingInstr(MI); 4751 4752 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4753 Ty = getBitcastRegisterType(Ty); 4754 Helper.bitcastDst(MI, Ty, 0); 4755 Dst = MI.getOperand(0).getReg(); 4756 B.setInsertPt(B.getMBB(), MI); 4757 } 4758 4759 // FIXME: We don't really need this intermediate instruction. The intrinsic 4760 // should be fixed to have a memory operand. Since it's readnone, we're not 4761 // allowed to add one. 4762 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4763 MI.RemoveOperand(1); // Remove intrinsic ID 4764 4765 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4766 // TODO: Should this use datalayout alignment? 4767 const unsigned MemSize = (Size + 7) / 8; 4768 const Align MemAlign(4); 4769 MachineMemOperand *MMO = MF.getMachineMemOperand( 4770 MachinePointerInfo(), 4771 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4772 MachineMemOperand::MOInvariant, 4773 MemSize, MemAlign); 4774 MI.addMemOperand(MF, MMO); 4775 4776 // There are no 96-bit result scalar loads, but widening to 128-bit should 4777 // always be legal. We may need to restore this to a 96-bit result if it turns 4778 // out this needs to be converted to a vector load during RegBankSelect. 4779 if (!isPowerOf2_32(Size)) { 4780 if (Ty.isVector()) 4781 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4782 else 4783 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4784 } 4785 4786 Observer.changedInstr(MI); 4787 return true; 4788 } 4789 4790 // TODO: Move to selection 4791 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4792 MachineRegisterInfo &MRI, 4793 MachineIRBuilder &B) const { 4794 if (!ST.isTrapHandlerEnabled() || 4795 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4796 return legalizeTrapEndpgm(MI, MRI, B); 4797 4798 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4799 switch (*HsaAbiVer) { 4800 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4801 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4802 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4803 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4804 return ST.supportsGetDoorbellID() ? 
4805 legalizeTrapHsa(MI, MRI, B) : 4806 legalizeTrapHsaQueuePtr(MI, MRI, B); 4807 } 4808 } 4809 4810 llvm_unreachable("Unknown trap handler"); 4811 } 4812 4813 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4814 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4815 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4816 MI.eraseFromParent(); 4817 return true; 4818 } 4819 4820 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4821 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4822 // Pass queue pointer to trap handler as input, and insert trap instruction 4823 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4824 Register LiveIn = 4825 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4826 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4827 return false; 4828 4829 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4830 B.buildCopy(SGPR01, LiveIn); 4831 B.buildInstr(AMDGPU::S_TRAP) 4832 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4833 .addReg(SGPR01, RegState::Implicit); 4834 4835 MI.eraseFromParent(); 4836 return true; 4837 } 4838 4839 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4840 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4841 B.buildInstr(AMDGPU::S_TRAP) 4842 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4843 MI.eraseFromParent(); 4844 return true; 4845 } 4846 4847 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4848 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4849 // Is non-HSA path or trap-handler disabled? Then, report a warning 4850 // accordingly 4851 if (!ST.isTrapHandlerEnabled() || 4852 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4853 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4854 "debugtrap handler not supported", 4855 MI.getDebugLoc(), DS_Warning); 4856 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4857 Ctx.diagnose(NoTrap); 4858 } else { 4859 // Insert debug-trap instruction 4860 B.buildInstr(AMDGPU::S_TRAP) 4861 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4862 } 4863 4864 MI.eraseFromParent(); 4865 return true; 4866 } 4867 4868 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4869 MachineIRBuilder &B) const { 4870 MachineRegisterInfo &MRI = *B.getMRI(); 4871 const LLT S16 = LLT::scalar(16); 4872 const LLT S32 = LLT::scalar(32); 4873 4874 Register DstReg = MI.getOperand(0).getReg(); 4875 Register NodePtr = MI.getOperand(2).getReg(); 4876 Register RayExtent = MI.getOperand(3).getReg(); 4877 Register RayOrigin = MI.getOperand(4).getReg(); 4878 Register RayDir = MI.getOperand(5).getReg(); 4879 Register RayInvDir = MI.getOperand(6).getReg(); 4880 Register TDescr = MI.getOperand(7).getReg(); 4881 4882 if (!ST.hasGFX10_AEncoding()) { 4883 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4884 "intrinsic not supported on subtarget", 4885 MI.getDebugLoc()); 4886 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4887 return false; 4888 } 4889 4890 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4891 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4892 const unsigned NumVDataDwords = 4; 4893 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 
12 : 11); 4894 const bool UseNSA = 4895 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4896 const unsigned BaseOpcodes[2][2] = { 4897 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4898 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4899 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4900 int Opcode; 4901 if (UseNSA) { 4902 Opcode = 4903 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4904 NumVDataDwords, NumVAddrDwords); 4905 } else { 4906 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4907 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4908 PowerOf2Ceil(NumVAddrDwords)); 4909 } 4910 assert(Opcode != -1); 4911 4912 SmallVector<Register, 12> Ops; 4913 if (Is64) { 4914 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4915 Ops.push_back(Unmerge.getReg(0)); 4916 Ops.push_back(Unmerge.getReg(1)); 4917 } else { 4918 Ops.push_back(NodePtr); 4919 } 4920 Ops.push_back(RayExtent); 4921 4922 auto packLanes = [&Ops, &S32, &B](Register Src) { 4923 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 4924 Ops.push_back(Unmerge.getReg(0)); 4925 Ops.push_back(Unmerge.getReg(1)); 4926 Ops.push_back(Unmerge.getReg(2)); 4927 }; 4928 4929 packLanes(RayOrigin); 4930 if (IsA16) { 4931 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 4932 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 4933 Register R1 = MRI.createGenericVirtualRegister(S32); 4934 Register R2 = MRI.createGenericVirtualRegister(S32); 4935 Register R3 = MRI.createGenericVirtualRegister(S32); 4936 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4937 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4938 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4939 Ops.push_back(R1); 4940 Ops.push_back(R2); 4941 Ops.push_back(R3); 4942 } else { 4943 packLanes(RayDir); 4944 packLanes(RayInvDir); 4945 } 4946 4947 if (!UseNSA) { 4948 // Build a single vector containing all the operands so far prepared. 4949 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 4950 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); 4951 Ops.clear(); 4952 Ops.push_back(MergedOps); 4953 } 4954 4955 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4956 .addDef(DstReg) 4957 .addImm(Opcode); 4958 4959 for (Register R : Ops) { 4960 MIB.addUse(R); 4961 } 4962 4963 MIB.addUse(TDescr) 4964 .addImm(IsA16 ? 1 : 0) 4965 .cloneMemRefs(MI); 4966 4967 MI.eraseFromParent(); 4968 return true; 4969 } 4970 4971 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) { 4972 B.buildConstant(MI.getOperand(0).getReg(), C); 4973 MI.eraseFromParent(); 4974 return true; 4975 } 4976 4977 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4978 MachineInstr &MI) const { 4979 MachineIRBuilder &B = Helper.MIRBuilder; 4980 MachineRegisterInfo &MRI = *B.getMRI(); 4981 4982 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
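  // Rough shape of the rewrite for amdgcn.if:
  //   %b, %mask = G_INTRINSIC_W_SIDE_EFFECTS amdgcn.if, %cond
  //   G_BRCOND %b, %bb.then
  // becomes approximately:
  //   %mask = SI_IF %cond, %bb.flow
  //   G_BR %bb.then
  // with the wave mask register class applied to the mask vregs.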
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
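  // The argument intrinsics below either fold to a constant when the value is
  // statically known (e.g. a workitem ID whose range is provably 0, or the
  // kernarg segment pointer outside a kernel), or lower to a copy from the
  // corresponding preloaded argument register via legalizePreloadedArgIntrin.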
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
      return replaceWithConstant(B, MI, 0);
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
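  // Every raw/struct buffer atomic variant below funnels into a single
  // lowering, legalizeBufferAtomic, which dispatches on the intrinsic ID.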
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}