1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPULegalizerInfo.h" 15 16 #include "AMDGPU.h" 17 #include "AMDGPUGlobalISelUtils.h" 18 #include "AMDGPUInstrInfo.h" 19 #include "AMDGPUTargetMachine.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "Utils/AMDGPUBaseInfo.h" 22 #include "llvm/ADT/ScopeExit.h" 23 #include "llvm/BinaryFormat/ELF.h" 24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/IntrinsicsAMDGPU.h" 29 30 #define DEBUG_TYPE "amdgpu-legalinfo" 31 32 using namespace llvm; 33 using namespace LegalizeActions; 34 using namespace LegalizeMutations; 35 using namespace LegalityPredicates; 36 using namespace MIPatternMatch; 37 38 // Hack until load/store selection patterns support any tuple of legal types. 39 static cl::opt<bool> EnableNewLegality( 40 "amdgpu-global-isel-new-legality", 41 cl::desc("Use GlobalISel desired legality, rather than try to use" 42 "rules compatible with selection patterns"), 43 cl::init(false), 44 cl::ReallyHidden); 45 46 static constexpr unsigned MaxRegisterSize = 1024; 47 48 // Round the number of elements to the next power of two elements 49 static LLT getPow2VectorType(LLT Ty) { 50 unsigned NElts = Ty.getNumElements(); 51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 52 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts)); 53 } 54 55 // Round the number of bits to the next power of two bits 56 static LLT getPow2ScalarType(LLT Ty) { 57 unsigned Bits = Ty.getSizeInBits(); 58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 59 return LLT::scalar(Pow2Bits); 60 } 61 62 /// \returns true if this is an odd sized vector which should widen by adding an 63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This 64 /// excludes s1 vectors, which should always be scalarized. 
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, 99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair( 111 TypeIdx, 112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); 113 }; 114 } 115 116 // Increase the number of vector elements to reach the next multiple of 32-bit 117 // type. 118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 119 return [=](const LegalityQuery &Query) { 120 const LLT Ty = Query.Types[TypeIdx]; 121 122 const LLT EltTy = Ty.getElementType(); 123 const int Size = Ty.getSizeInBits(); 124 const int EltSize = EltTy.getSizeInBits(); 125 const int NextMul32 = (Size + 31) / 32; 126 127 assert(EltSize < 32); 128 129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 131 }; 132 } 133 134 static LLT getBitcastRegisterType(const LLT Ty) { 135 const unsigned Size = Ty.getSizeInBits(); 136 137 LLT CoercedTy; 138 if (Size <= 32) { 139 // <2 x s8> -> s16 140 // <4 x s8> -> s32 141 return LLT::scalar(Size); 142 } 143 144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 145 } 146 147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 148 return [=](const LegalityQuery &Query) { 149 const LLT Ty = Query.Types[TypeIdx]; 150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 151 }; 152 } 153 154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 155 return [=](const LegalityQuery &Query) { 156 const LLT Ty = Query.Types[TypeIdx]; 157 unsigned Size = Ty.getSizeInBits(); 158 assert(Size % 32 == 0); 159 return std::make_pair( 160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 161 }; 162 } 163 164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 165 return [=](const LegalityQuery &Query) { 166 const LLT QueryTy = Query.Types[TypeIdx]; 167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 168 }; 169 } 170 171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 172 return [=](const LegalityQuery &Query) { 173 const LLT 
QueryTy = Query.Types[TypeIdx]; 174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 175 }; 176 } 177 178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 179 return [=](const LegalityQuery &Query) { 180 const LLT QueryTy = Query.Types[TypeIdx]; 181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 182 }; 183 } 184 185 static bool isRegisterSize(unsigned Size) { 186 return Size % 32 == 0 && Size <= MaxRegisterSize; 187 } 188 189 static bool isRegisterVectorElementType(LLT EltTy) { 190 const int EltSize = EltTy.getSizeInBits(); 191 return EltSize == 16 || EltSize % 32 == 0; 192 } 193 194 static bool isRegisterVectorType(LLT Ty) { 195 const int EltSize = Ty.getElementType().getSizeInBits(); 196 return EltSize == 32 || EltSize == 64 || 197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 198 EltSize == 128 || EltSize == 256; 199 } 200 201 static bool isRegisterType(LLT Ty) { 202 if (!isRegisterSize(Ty.getSizeInBits())) 203 return false; 204 205 if (Ty.isVector()) 206 return isRegisterVectorType(Ty); 207 208 return true; 209 } 210 211 // Any combination of 32 or 64-bit elements up the maximum register size, and 212 // multiples of v2s16. 213 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 214 return [=](const LegalityQuery &Query) { 215 return isRegisterType(Query.Types[TypeIdx]); 216 }; 217 } 218 219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 220 return [=](const LegalityQuery &Query) { 221 const LLT QueryTy = Query.Types[TypeIdx]; 222 if (!QueryTy.isVector()) 223 return false; 224 const LLT EltTy = QueryTy.getElementType(); 225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 226 }; 227 } 228 229 // If we have a truncating store or an extending load with a data size larger 230 // than 32-bits, we need to reduce to a 32-bit type. 231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 232 return [=](const LegalityQuery &Query) { 233 const LLT Ty = Query.Types[TypeIdx]; 234 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 236 }; 237 } 238 239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 240 // handle some operations by just promoting the register during 241 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 243 bool IsLoad) { 244 switch (AS) { 245 case AMDGPUAS::PRIVATE_ADDRESS: 246 // FIXME: Private element size. 247 return ST.enableFlatScratch() ? 128 : 32; 248 case AMDGPUAS::LOCAL_ADDRESS: 249 return ST.useDS128() ? 128 : 64; 250 case AMDGPUAS::GLOBAL_ADDRESS: 251 case AMDGPUAS::CONSTANT_ADDRESS: 252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 253 // Treat constant and global as identical. SMRD loads are sometimes usable for 254 // global loads (ideally constant address space should be eliminated) 255 // depending on the context. Legality cannot be context dependent, but 256 // RegBankSelect can split the load as necessary depending on the pointer 257 // register bank/uniformity and if the memory is invariant or not written in a 258 // kernel. 259 return IsLoad ? 512 : 128; 260 default: 261 // Flat addresses may contextually need to be split to 32-bit parts if they 262 // may alias scratch depending on the subtarget. 
263 return 128; 264 } 265 } 266 267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 268 const LegalityQuery &Query) { 269 const LLT Ty = Query.Types[0]; 270 271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 273 274 unsigned RegSize = Ty.getSizeInBits(); 275 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 276 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits; 277 unsigned AS = Query.Types[1].getAddressSpace(); 278 279 // All of these need to be custom lowered to cast the pointer operand. 280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 281 return false; 282 283 // Do not handle extending vector loads. 284 if (Ty.isVector() && MemSize != RegSize) 285 return false; 286 287 // TODO: We should be able to widen loads if the alignment is high enough, but 288 // we also need to modify the memory access size. 289 #if 0 290 // Accept widening loads based on alignment. 291 if (IsLoad && MemSize < Size) 292 MemSize = std::max(MemSize, Align); 293 #endif 294 295 // Only 1-byte and 2-byte to 32-bit extloads are valid. 296 if (MemSize != RegSize && RegSize != 32) 297 return false; 298 299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 300 return false; 301 302 switch (MemSize) { 303 case 8: 304 case 16: 305 case 32: 306 case 64: 307 case 128: 308 break; 309 case 96: 310 if (!ST.hasDwordx3LoadStores()) 311 return false; 312 break; 313 case 256: 314 case 512: 315 // These may contextually need to be broken down. 316 break; 317 default: 318 return false; 319 } 320 321 assert(RegSize >= MemSize); 322 323 if (AlignBits < MemSize) { 324 const SITargetLowering *TLI = ST.getTargetLowering(); 325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 326 Align(AlignBits / 8))) 327 return false; 328 } 329 330 return true; 331 } 332 333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 334 // workaround this. Eventually it should ignore the type for loads and only care 335 // about the size. Return true in cases where we will workaround this for now by 336 // bitcasting. 337 static bool loadStoreBitcastWorkaround(const LLT Ty) { 338 if (EnableNewLegality) 339 return false; 340 341 const unsigned Size = Ty.getSizeInBits(); 342 if (Size <= 64) 343 return false; 344 if (!Ty.isVector()) 345 return true; 346 347 LLT EltTy = Ty.getElementType(); 348 if (EltTy.isPointer()) 349 return true; 350 351 unsigned EltSize = EltTy.getSizeInBits(); 352 return EltSize != 32 && EltSize != 64; 353 } 354 355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 356 const LLT Ty = Query.Types[0]; 357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 358 !loadStoreBitcastWorkaround(Ty); 359 } 360 361 /// Return true if a load or store of the type should be lowered with a bitcast 362 /// to a different type. 363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 364 const LLT MemTy) { 365 const unsigned MemSizeInBits = MemTy.getSizeInBits(); 366 const unsigned Size = Ty.getSizeInBits(); 367 if (Size != MemSizeInBits) 368 return Size <= 32 && Ty.isVector(); 369 370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 371 return true; 372 373 // Don't try to handle bitcasting vector ext loads for now. 
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 375 (Size <= 32 || isRegisterSize(Size)) && 376 !isRegisterVectorElementType(Ty.getElementType()); 377 } 378 379 /// Return true if we should legalize a load by widening an odd sized memory 380 /// access up to the alignment. Note this case when the memory access itself 381 /// changes, not the size of the result register. 382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, 383 uint64_t AlignInBits, unsigned AddrSpace, 384 unsigned Opcode) { 385 unsigned SizeInBits = MemoryTy.getSizeInBits(); 386 // We don't want to widen cases that are naturally legal. 387 if (isPowerOf2_32(SizeInBits)) 388 return false; 389 390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 391 // end up widening these for a scalar load during RegBankSelect, since there 392 // aren't 96-bit scalar loads. 393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 394 return false; 395 396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 397 return false; 398 399 // A load is known dereferenceable up to the alignment, so it's legal to widen 400 // to it. 401 // 402 // TODO: Could check dereferenceable for less aligned cases. 403 unsigned RoundedSize = NextPowerOf2(SizeInBits); 404 if (AlignInBits < RoundedSize) 405 return false; 406 407 // Do not widen if it would introduce a slow unaligned load. 408 const SITargetLowering *TLI = ST.getTargetLowering(); 409 bool Fast = false; 410 return TLI->allowsMisalignedMemoryAccessesImpl( 411 RoundedSize, AddrSpace, Align(AlignInBits / 8), 412 MachineMemOperand::MOLoad, &Fast) && 413 Fast; 414 } 415 416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 417 unsigned Opcode) { 418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 419 return false; 420 421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, 422 Query.MMODescrs[0].AlignInBits, 423 Query.Types[1].getAddressSpace(), Opcode); 424 } 425 426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 427 const GCNTargetMachine &TM) 428 : ST(ST_) { 429 using namespace TargetOpcode; 430 431 auto GetAddrSpacePtr = [&TM](unsigned AS) { 432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 433 }; 434 435 const LLT S1 = LLT::scalar(1); 436 const LLT S8 = LLT::scalar(8); 437 const LLT S16 = LLT::scalar(16); 438 const LLT S32 = LLT::scalar(32); 439 const LLT S64 = LLT::scalar(64); 440 const LLT S128 = LLT::scalar(128); 441 const LLT S256 = LLT::scalar(256); 442 const LLT S512 = LLT::scalar(512); 443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 444 445 const LLT V2S8 = LLT::fixed_vector(2, 8); 446 const LLT V2S16 = LLT::fixed_vector(2, 16); 447 const LLT V4S16 = LLT::fixed_vector(4, 16); 448 449 const LLT V2S32 = LLT::fixed_vector(2, 32); 450 const LLT V3S32 = LLT::fixed_vector(3, 32); 451 const LLT V4S32 = LLT::fixed_vector(4, 32); 452 const LLT V5S32 = LLT::fixed_vector(5, 32); 453 const LLT V6S32 = LLT::fixed_vector(6, 32); 454 const LLT V7S32 = LLT::fixed_vector(7, 32); 455 const LLT V8S32 = LLT::fixed_vector(8, 32); 456 const LLT V9S32 = LLT::fixed_vector(9, 32); 457 const LLT V10S32 = LLT::fixed_vector(10, 32); 458 const LLT V11S32 = LLT::fixed_vector(11, 32); 459 const LLT V12S32 = LLT::fixed_vector(12, 32); 460 const LLT V13S32 = LLT::fixed_vector(13, 32); 461 const LLT V14S32 = LLT::fixed_vector(14, 32); 462 const LLT V15S32 = LLT::fixed_vector(15, 32); 463 const LLT V16S32 = LLT::fixed_vector(16, 32); 464 const LLT V32S32 = 
LLT::fixed_vector(32, 32); 465 466 const LLT V2S64 = LLT::fixed_vector(2, 64); 467 const LLT V3S64 = LLT::fixed_vector(3, 64); 468 const LLT V4S64 = LLT::fixed_vector(4, 64); 469 const LLT V5S64 = LLT::fixed_vector(5, 64); 470 const LLT V6S64 = LLT::fixed_vector(6, 64); 471 const LLT V7S64 = LLT::fixed_vector(7, 64); 472 const LLT V8S64 = LLT::fixed_vector(8, 64); 473 const LLT V16S64 = LLT::fixed_vector(16, 64); 474 475 std::initializer_list<LLT> AllS32Vectors = 476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 478 std::initializer_list<LLT> AllS64Vectors = 479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 480 481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 488 489 const LLT CodePtr = FlatPtr; 490 491 const std::initializer_list<LLT> AddrSpaces64 = { 492 GlobalPtr, ConstantPtr, FlatPtr 493 }; 494 495 const std::initializer_list<LLT> AddrSpaces32 = { 496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 497 }; 498 499 const std::initializer_list<LLT> FPTypesBase = { 500 S32, S64 501 }; 502 503 const std::initializer_list<LLT> FPTypes16 = { 504 S32, S64, S16 505 }; 506 507 const std::initializer_list<LLT> FPTypesPK16 = { 508 S32, S64, S16, V2S16 509 }; 510 511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 512 513 // s1 for VCC branches, s32 for SCC branches. 514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 515 516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 517 // elements for v3s16 518 getActionDefinitionsBuilder(G_PHI) 519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 520 .legalFor(AllS32Vectors) 521 .legalFor(AllS64Vectors) 522 .legalFor(AddrSpaces64) 523 .legalFor(AddrSpaces32) 524 .legalIf(isPointer(0)) 525 .clampScalar(0, S16, S256) 526 .widenScalarToNextPow2(0, 32) 527 .clampMaxNumElements(0, S32, 16) 528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 529 .scalarize(0); 530 531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 532 // Full set of gfx9 features. 533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 534 .legalFor({S32, S16, V2S16}) 535 .minScalar(0, S16) 536 .clampMaxNumElementsStrict(0, S16, 2) 537 .widenScalarToNextMultipleOf(0, 32) 538 .maxScalar(0, S32) 539 .scalarize(0); 540 541 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 542 .legalFor({S32, S16, V2S16}) // Clamp modifier 543 .minScalarOrElt(0, S16) 544 .clampMaxNumElementsStrict(0, S16, 2) 545 .scalarize(0) 546 .widenScalarToNextPow2(0, 32) 547 .lower(); 548 } else if (ST.has16BitInsts()) { 549 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 550 .legalFor({S32, S16}) 551 .minScalar(0, S16) 552 .widenScalarToNextMultipleOf(0, 32) 553 .maxScalar(0, S32) 554 .scalarize(0); 555 556 // Technically the saturating operations require clamp bit support, but this 557 // was introduced at the same time as 16-bit operations. 
558 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 559 .legalFor({S32, S16}) // Clamp modifier 560 .minScalar(0, S16) 561 .scalarize(0) 562 .widenScalarToNextPow2(0, 16) 563 .lower(); 564 565 // We're just lowering this, but it helps get a better result to try to 566 // coerce to the desired type first. 567 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 568 .minScalar(0, S16) 569 .scalarize(0) 570 .lower(); 571 } else { 572 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 573 .legalFor({S32}) 574 .widenScalarToNextMultipleOf(0, 32) 575 .clampScalar(0, S32, S32) 576 .scalarize(0); 577 578 if (ST.hasIntClamp()) { 579 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 580 .legalFor({S32}) // Clamp modifier. 581 .scalarize(0) 582 .minScalarOrElt(0, S32) 583 .lower(); 584 } else { 585 // Clamp bit support was added in VI, along with 16-bit operations. 586 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 587 .minScalar(0, S32) 588 .scalarize(0) 589 .lower(); 590 } 591 592 // FIXME: DAG expansion gets better results. The widening uses the smaller 593 // range values and goes for the min/max lowering directly. 594 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 595 .minScalar(0, S32) 596 .scalarize(0) 597 .lower(); 598 } 599 600 getActionDefinitionsBuilder( 601 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 602 .customFor({S32, S64}) 603 .clampScalar(0, S32, S64) 604 .widenScalarToNextPow2(0, 32) 605 .scalarize(0); 606 607 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 608 .legalFor({S32}) 609 .maxScalar(0, S32); 610 611 if (ST.hasVOP3PInsts()) { 612 Mulh 613 .clampMaxNumElements(0, S8, 2) 614 .lowerFor({V2S8}); 615 } 616 617 Mulh 618 .scalarize(0) 619 .lower(); 620 621 // Report legal for any types we can handle anywhere. For the cases only legal 622 // on the SALU, RegBankSelect will be able to re-legalize. 623 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 624 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 625 .clampScalar(0, S32, S64) 626 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 627 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 628 .widenScalarToNextPow2(0) 629 .scalarize(0); 630 631 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 632 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 633 .legalFor({{S32, S1}, {S32, S32}}) 634 .minScalar(0, S32) 635 // TODO: .scalarize(0) 636 .lower(); 637 638 getActionDefinitionsBuilder(G_BITCAST) 639 // Don't worry about the size constraint. 640 .legalIf(all(isRegisterType(0), isRegisterType(1))) 641 .lower(); 642 643 644 getActionDefinitionsBuilder(G_CONSTANT) 645 .legalFor({S1, S32, S64, S16, GlobalPtr, 646 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 647 .legalIf(isPointer(0)) 648 .clampScalar(0, S32, S64) 649 .widenScalarToNextPow2(0); 650 651 getActionDefinitionsBuilder(G_FCONSTANT) 652 .legalFor({S32, S64, S16}) 653 .clampScalar(0, S16, S64); 654 655 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 656 .legalIf(isRegisterType(0)) 657 // s1 and s16 are special cases because they have legal operations on 658 // them, but don't really occupy registers in the normal way. 
659 .legalFor({S1, S16}) 660 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 661 .clampScalarOrElt(0, S32, MaxScalar) 662 .widenScalarToNextPow2(0, 32) 663 .clampMaxNumElements(0, S32, 16); 664 665 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 666 667 // If the amount is divergent, we have to do a wave reduction to get the 668 // maximum value, so this is expanded during RegBankSelect. 669 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 670 .legalFor({{PrivatePtr, S32}}); 671 672 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 673 .customIf(typeIsNot(0, PrivatePtr)); 674 675 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 676 677 auto &FPOpActions = getActionDefinitionsBuilder( 678 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 679 .legalFor({S32, S64}); 680 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 681 .customFor({S32, S64}); 682 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 683 .customFor({S32, S64}); 684 685 if (ST.has16BitInsts()) { 686 if (ST.hasVOP3PInsts()) 687 FPOpActions.legalFor({S16, V2S16}); 688 else 689 FPOpActions.legalFor({S16}); 690 691 TrigActions.customFor({S16}); 692 FDIVActions.customFor({S16}); 693 } 694 695 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 696 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 697 698 if (ST.hasVOP3PInsts()) { 699 MinNumMaxNum.customFor(FPTypesPK16) 700 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 701 .clampMaxNumElements(0, S16, 2) 702 .clampScalar(0, S16, S64) 703 .scalarize(0); 704 } else if (ST.has16BitInsts()) { 705 MinNumMaxNum.customFor(FPTypes16) 706 .clampScalar(0, S16, S64) 707 .scalarize(0); 708 } else { 709 MinNumMaxNum.customFor(FPTypesBase) 710 .clampScalar(0, S32, S64) 711 .scalarize(0); 712 } 713 714 if (ST.hasVOP3PInsts()) 715 FPOpActions.clampMaxNumElementsStrict(0, S16, 2); 716 717 FPOpActions 718 .scalarize(0) 719 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 720 721 TrigActions 722 .scalarize(0) 723 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 724 725 FDIVActions 726 .scalarize(0) 727 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 728 729 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 730 .legalFor(FPTypesPK16) 731 .clampMaxNumElementsStrict(0, S16, 2) 732 .scalarize(0) 733 .clampScalar(0, S16, S64); 734 735 if (ST.has16BitInsts()) { 736 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 737 .legalFor({S32, S64, S16}) 738 .scalarize(0) 739 .clampScalar(0, S16, S64); 740 } else { 741 getActionDefinitionsBuilder(G_FSQRT) 742 .legalFor({S32, S64}) 743 .scalarize(0) 744 .clampScalar(0, S32, S64); 745 746 if (ST.hasFractBug()) { 747 getActionDefinitionsBuilder(G_FFLOOR) 748 .customFor({S64}) 749 .legalFor({S32, S64}) 750 .scalarize(0) 751 .clampScalar(0, S32, S64); 752 } else { 753 getActionDefinitionsBuilder(G_FFLOOR) 754 .legalFor({S32, S64}) 755 .scalarize(0) 756 .clampScalar(0, S32, S64); 757 } 758 } 759 760 getActionDefinitionsBuilder(G_FPTRUNC) 761 .legalFor({{S32, S64}, {S16, S32}}) 762 .scalarize(0) 763 .lower(); 764 765 getActionDefinitionsBuilder(G_FPEXT) 766 .legalFor({{S64, S32}, {S32, S16}}) 767 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 768 .scalarize(0); 769 770 getActionDefinitionsBuilder(G_FSUB) 771 // Use actual fsub instruction 772 .legalFor({S32}) 773 // Must use fadd + fneg 774 .lowerFor({S64, S16, V2S16}) 775 .scalarize(0) 776 .clampScalar(0, S32, S64); 777 778 // Whether this is legal depends on the floating point mode for the function. 
779 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 780 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 781 FMad.customFor({S32, S16}); 782 else if (ST.hasMadMacF32Insts()) 783 FMad.customFor({S32}); 784 else if (ST.hasMadF16()) 785 FMad.customFor({S16}); 786 FMad.scalarize(0) 787 .lower(); 788 789 auto &FRem = getActionDefinitionsBuilder(G_FREM); 790 if (ST.has16BitInsts()) { 791 FRem.customFor({S16, S32, S64}); 792 } else { 793 FRem.minScalar(0, S32) 794 .customFor({S32, S64}); 795 } 796 FRem.scalarize(0); 797 798 // TODO: Do we need to clamp maximum bitwidth? 799 getActionDefinitionsBuilder(G_TRUNC) 800 .legalIf(isScalar(0)) 801 .legalFor({{V2S16, V2S32}}) 802 .clampMaxNumElements(0, S16, 2) 803 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 804 // situations (like an invalid implicit use), we don't want to infinite loop 805 // in the legalizer. 806 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 807 .alwaysLegal(); 808 809 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 810 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 811 {S32, S1}, {S64, S1}, {S16, S1}}) 812 .scalarize(0) 813 .clampScalar(0, S32, S64) 814 .widenScalarToNextPow2(1, 32); 815 816 // TODO: Split s1->s64 during regbankselect for VALU. 817 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 818 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 819 .lowerIf(typeIs(1, S1)) 820 .customFor({{S32, S64}, {S64, S64}}); 821 if (ST.has16BitInsts()) 822 IToFP.legalFor({{S16, S16}}); 823 IToFP.clampScalar(1, S32, S64) 824 .minScalar(0, S32) 825 .scalarize(0) 826 .widenScalarToNextPow2(1); 827 828 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 829 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 830 .customFor({{S64, S32}, {S64, S64}}) 831 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 832 if (ST.has16BitInsts()) 833 FPToI.legalFor({{S16, S16}}); 834 else 835 FPToI.minScalar(1, S32); 836 837 FPToI.minScalar(0, S32) 838 .widenScalarToNextPow2(0, 32) 839 .scalarize(0) 840 .lower(); 841 842 // Lower roundeven into G_FRINT 843 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 844 .scalarize(0) 845 .lower(); 846 847 if (ST.has16BitInsts()) { 848 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 849 .legalFor({S16, S32, S64}) 850 .clampScalar(0, S16, S64) 851 .scalarize(0); 852 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 853 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 854 .legalFor({S32, S64}) 855 .clampScalar(0, S32, S64) 856 .scalarize(0); 857 } else { 858 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 859 .legalFor({S32}) 860 .customFor({S64}) 861 .clampScalar(0, S32, S64) 862 .scalarize(0); 863 } 864 865 getActionDefinitionsBuilder(G_PTR_ADD) 866 .legalIf(all(isPointer(0), sameSize(0, 1))) 867 .scalarize(0) 868 .scalarSameSizeAs(1, 0); 869 870 getActionDefinitionsBuilder(G_PTRMASK) 871 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 872 .scalarSameSizeAs(1, 0) 873 .scalarize(0); 874 875 auto &CmpBuilder = 876 getActionDefinitionsBuilder(G_ICMP) 877 // The compare output type differs based on the register bank of the output, 878 // so make both s1 and s32 legal. 879 // 880 // Scalar compares producing output in scc will be promoted to s32, as that 881 // is the allocatable register type that will be needed for the copy from 882 // scc. 
This will be promoted during RegBankSelect, and we assume something 883 // before that won't try to use s32 result types. 884 // 885 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 886 // bank. 887 .legalForCartesianProduct( 888 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 889 .legalForCartesianProduct( 890 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 891 if (ST.has16BitInsts()) { 892 CmpBuilder.legalFor({{S1, S16}}); 893 } 894 895 CmpBuilder 896 .widenScalarToNextPow2(1) 897 .clampScalar(1, S32, S64) 898 .scalarize(0) 899 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 900 901 getActionDefinitionsBuilder(G_FCMP) 902 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 903 .widenScalarToNextPow2(1) 904 .clampScalar(1, S32, S64) 905 .scalarize(0); 906 907 // FIXME: fpow has a selection pattern that should move to custom lowering. 908 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 909 if (ST.has16BitInsts()) 910 Exp2Ops.legalFor({S32, S16}); 911 else 912 Exp2Ops.legalFor({S32}); 913 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 914 Exp2Ops.scalarize(0); 915 916 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 917 if (ST.has16BitInsts()) 918 ExpOps.customFor({{S32}, {S16}}); 919 else 920 ExpOps.customFor({S32}); 921 ExpOps.clampScalar(0, MinScalarFPTy, S32) 922 .scalarize(0); 923 924 getActionDefinitionsBuilder(G_FPOWI) 925 .clampScalar(0, MinScalarFPTy, S32) 926 .lower(); 927 928 // The 64-bit versions produce 32-bit results, but only on the SALU. 929 getActionDefinitionsBuilder(G_CTPOP) 930 .legalFor({{S32, S32}, {S32, S64}}) 931 .clampScalar(0, S32, S32) 932 .widenScalarToNextPow2(1, 32) 933 .clampScalar(1, S32, S64) 934 .scalarize(0) 935 .widenScalarToNextPow2(0, 32); 936 937 938 // The hardware instructions return a different result on 0 than the generic 939 // instructions expect. The hardware produces -1, but these produce the 940 // bitwidth. 941 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 942 .scalarize(0) 943 .clampScalar(0, S32, S32) 944 .clampScalar(1, S32, S64) 945 .widenScalarToNextPow2(0, 32) 946 .widenScalarToNextPow2(1, 32) 947 .custom(); 948 949 // The 64-bit versions produce 32-bit results, but only on the SALU. 950 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 951 .legalFor({{S32, S32}, {S32, S64}}) 952 .clampScalar(0, S32, S32) 953 .clampScalar(1, S32, S64) 954 .scalarize(0) 955 .widenScalarToNextPow2(0, 32) 956 .widenScalarToNextPow2(1, 32); 957 958 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 959 // RegBankSelect. 960 getActionDefinitionsBuilder(G_BITREVERSE) 961 .legalFor({S32, S64}) 962 .clampScalar(0, S32, S64) 963 .scalarize(0) 964 .widenScalarToNextPow2(0); 965 966 if (ST.has16BitInsts()) { 967 getActionDefinitionsBuilder(G_BSWAP) 968 .legalFor({S16, S32, V2S16}) 969 .clampMaxNumElementsStrict(0, S16, 2) 970 // FIXME: Fixing non-power-of-2 before clamp is workaround for 971 // narrowScalar limitation. 
972 .widenScalarToNextPow2(0) 973 .clampScalar(0, S16, S32) 974 .scalarize(0); 975 976 if (ST.hasVOP3PInsts()) { 977 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 978 .legalFor({S32, S16, V2S16}) 979 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 980 .clampMaxNumElements(0, S16, 2) 981 .minScalar(0, S16) 982 .widenScalarToNextPow2(0) 983 .scalarize(0) 984 .lower(); 985 } else { 986 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 987 .legalFor({S32, S16}) 988 .widenScalarToNextPow2(0) 989 .minScalar(0, S16) 990 .scalarize(0) 991 .lower(); 992 } 993 } else { 994 // TODO: Should have same legality without v_perm_b32 995 getActionDefinitionsBuilder(G_BSWAP) 996 .legalFor({S32}) 997 .lowerIf(scalarNarrowerThan(0, 32)) 998 // FIXME: Fixing non-power-of-2 before clamp is workaround for 999 // narrowScalar limitation. 1000 .widenScalarToNextPow2(0) 1001 .maxScalar(0, S32) 1002 .scalarize(0) 1003 .lower(); 1004 1005 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1006 .legalFor({S32}) 1007 .minScalar(0, S32) 1008 .widenScalarToNextPow2(0) 1009 .scalarize(0) 1010 .lower(); 1011 } 1012 1013 getActionDefinitionsBuilder(G_INTTOPTR) 1014 // List the common cases 1015 .legalForCartesianProduct(AddrSpaces64, {S64}) 1016 .legalForCartesianProduct(AddrSpaces32, {S32}) 1017 .scalarize(0) 1018 // Accept any address space as long as the size matches 1019 .legalIf(sameSize(0, 1)) 1020 .widenScalarIf(smallerThan(1, 0), 1021 [](const LegalityQuery &Query) { 1022 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1023 }) 1024 .narrowScalarIf(largerThan(1, 0), 1025 [](const LegalityQuery &Query) { 1026 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1027 }); 1028 1029 getActionDefinitionsBuilder(G_PTRTOINT) 1030 // List the common cases 1031 .legalForCartesianProduct(AddrSpaces64, {S64}) 1032 .legalForCartesianProduct(AddrSpaces32, {S32}) 1033 .scalarize(0) 1034 // Accept any address space as long as the size matches 1035 .legalIf(sameSize(0, 1)) 1036 .widenScalarIf(smallerThan(0, 1), 1037 [](const LegalityQuery &Query) { 1038 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1039 }) 1040 .narrowScalarIf( 1041 largerThan(0, 1), 1042 [](const LegalityQuery &Query) { 1043 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1044 }); 1045 1046 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1047 .scalarize(0) 1048 .custom(); 1049 1050 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1051 bool IsLoad) -> bool { 1052 const LLT DstTy = Query.Types[0]; 1053 1054 // Split vector extloads. 1055 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1056 1057 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1058 return true; 1059 1060 const LLT PtrTy = Query.Types[1]; 1061 unsigned AS = PtrTy.getAddressSpace(); 1062 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1063 return true; 1064 1065 // Catch weird sized loads that don't evenly divide into the access sizes 1066 // TODO: May be able to widen depending on alignment etc. 1067 unsigned NumRegs = (MemSize + 31) / 32; 1068 if (NumRegs == 3) { 1069 if (!ST.hasDwordx3LoadStores()) 1070 return true; 1071 } else { 1072 // If the alignment allows, these should have been widened. 1073 if (!isPowerOf2_32(NumRegs)) 1074 return true; 1075 } 1076 1077 return false; 1078 }; 1079 1080 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 
0 : 32; 1081 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1082 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1083 1084 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1085 // LDS 1086 // TODO: Unsupported flat for SI. 1087 1088 for (unsigned Op : {G_LOAD, G_STORE}) { 1089 const bool IsStore = Op == G_STORE; 1090 1091 auto &Actions = getActionDefinitionsBuilder(Op); 1092 // Explicitly list some common cases. 1093 // TODO: Does this help compile time at all? 1094 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1095 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1096 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1097 {S64, GlobalPtr, S64, GlobalAlign32}, 1098 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1099 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1100 {S32, GlobalPtr, S8, GlobalAlign8}, 1101 {S32, GlobalPtr, S16, GlobalAlign16}, 1102 1103 {S32, LocalPtr, S32, 32}, 1104 {S64, LocalPtr, S64, 32}, 1105 {V2S32, LocalPtr, V2S32, 32}, 1106 {S32, LocalPtr, S8, 8}, 1107 {S32, LocalPtr, S16, 16}, 1108 {V2S16, LocalPtr, S32, 32}, 1109 1110 {S32, PrivatePtr, S32, 32}, 1111 {S32, PrivatePtr, S8, 8}, 1112 {S32, PrivatePtr, S16, 16}, 1113 {V2S16, PrivatePtr, S32, 32}, 1114 1115 {S32, ConstantPtr, S32, GlobalAlign32}, 1116 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1117 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1118 {S64, ConstantPtr, S64, GlobalAlign32}, 1119 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1120 Actions.legalIf( 1121 [=](const LegalityQuery &Query) -> bool { 1122 return isLoadStoreLegal(ST, Query); 1123 }); 1124 1125 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1126 // 64-bits. 1127 // 1128 // TODO: Should generalize bitcast action into coerce, which will also cover 1129 // inserting addrspacecasts. 1130 Actions.customIf(typeIs(1, Constant32Ptr)); 1131 1132 // Turn any illegal element vectors into something easier to deal 1133 // with. These will ultimately produce 32-bit scalar shifts to extract the 1134 // parts anyway. 1135 // 1136 // For odd 16-bit element vectors, prefer to split those into pieces with 1137 // 16-bit vector parts. 1138 Actions.bitcastIf( 1139 [=](const LegalityQuery &Query) -> bool { 1140 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1141 Query.MMODescrs[0].MemoryTy); 1142 }, bitcastToRegisterType(0)); 1143 1144 if (!IsStore) { 1145 // Widen suitably aligned loads by loading extra bytes. The standard 1146 // legalization actions can't properly express widening memory operands. 1147 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1148 return shouldWidenLoad(ST, Query, G_LOAD); 1149 }); 1150 } 1151 1152 // FIXME: load/store narrowing should be moved to lower action 1153 Actions 1154 .narrowScalarIf( 1155 [=](const LegalityQuery &Query) -> bool { 1156 return !Query.Types[0].isVector() && 1157 needToSplitMemOp(Query, Op == G_LOAD); 1158 }, 1159 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1160 const LLT DstTy = Query.Types[0]; 1161 const LLT PtrTy = Query.Types[1]; 1162 1163 const unsigned DstSize = DstTy.getSizeInBits(); 1164 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1165 1166 // Split extloads. 
1167 if (DstSize > MemSize) 1168 return std::make_pair(0, LLT::scalar(MemSize)); 1169 1170 unsigned MaxSize = maxSizeForAddrSpace(ST, 1171 PtrTy.getAddressSpace(), 1172 Op == G_LOAD); 1173 if (MemSize > MaxSize) 1174 return std::make_pair(0, LLT::scalar(MaxSize)); 1175 1176 uint64_t Align = Query.MMODescrs[0].AlignInBits; 1177 return std::make_pair(0, LLT::scalar(Align)); 1178 }) 1179 .fewerElementsIf( 1180 [=](const LegalityQuery &Query) -> bool { 1181 return Query.Types[0].isVector() && 1182 needToSplitMemOp(Query, Op == G_LOAD); 1183 }, 1184 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1185 const LLT DstTy = Query.Types[0]; 1186 const LLT PtrTy = Query.Types[1]; 1187 1188 LLT EltTy = DstTy.getElementType(); 1189 unsigned MaxSize = maxSizeForAddrSpace(ST, 1190 PtrTy.getAddressSpace(), 1191 Op == G_LOAD); 1192 1193 // FIXME: Handle widened to power of 2 results better. This ends 1194 // up scalarizing. 1195 // FIXME: 3 element stores scalarized on SI 1196 1197 // Split if it's too large for the address space. 1198 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1199 if (MemSize > MaxSize) { 1200 unsigned NumElts = DstTy.getNumElements(); 1201 unsigned EltSize = EltTy.getSizeInBits(); 1202 1203 if (MaxSize % EltSize == 0) { 1204 return std::make_pair( 1205 0, LLT::scalarOrVector( 1206 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1207 } 1208 1209 unsigned NumPieces = MemSize / MaxSize; 1210 1211 // FIXME: Refine when odd breakdowns handled 1212 // The scalars will need to be re-legalized. 1213 if (NumPieces == 1 || NumPieces >= NumElts || 1214 NumElts % NumPieces != 0) 1215 return std::make_pair(0, EltTy); 1216 1217 return std::make_pair( 1218 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1219 } 1220 1221 // FIXME: We could probably handle weird extending loads better. 1222 if (DstTy.getSizeInBits() > MemSize) 1223 return std::make_pair(0, EltTy); 1224 1225 unsigned EltSize = EltTy.getSizeInBits(); 1226 unsigned DstSize = DstTy.getSizeInBits(); 1227 if (!isPowerOf2_32(DstSize)) { 1228 // We're probably decomposing an odd sized store. Try to split 1229 // to the widest type. TODO: Account for alignment. As-is it 1230 // should be OK, since the new parts will be further legalized. 1231 unsigned FloorSize = PowerOf2Floor(DstSize); 1232 return std::make_pair( 1233 0, LLT::scalarOrVector( 1234 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1235 } 1236 1237 // May need relegalization for the scalars. 1238 return std::make_pair(0, EltTy); 1239 }) 1240 .minScalar(0, S32) 1241 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1242 .widenScalarToNextPow2(0) 1243 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1244 .lower(); 1245 } 1246 1247 // FIXME: Unaligned accesses not lowered. 1248 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1249 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1250 {S32, GlobalPtr, S16, 2 * 8}, 1251 {S32, LocalPtr, S8, 8}, 1252 {S32, LocalPtr, S16, 16}, 1253 {S32, PrivatePtr, S8, 8}, 1254 {S32, PrivatePtr, S16, 16}, 1255 {S32, ConstantPtr, S8, 8}, 1256 {S32, ConstantPtr, S16, 2 * 8}}) 1257 .legalIf( 1258 [=](const LegalityQuery &Query) -> bool { 1259 return isLoadStoreLegal(ST, Query); 1260 }); 1261 1262 if (ST.hasFlatAddressSpace()) { 1263 ExtLoads.legalForTypesWithMemDesc( 1264 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1265 } 1266 1267 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1268 // 64-bits. 
1269 // 1270 // TODO: Should generalize bitcast action into coerce, which will also cover 1271 // inserting addrspacecasts. 1272 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1273 1274 ExtLoads.clampScalar(0, S32, S32) 1275 .widenScalarToNextPow2(0) 1276 .lower(); 1277 1278 auto &Atomics = getActionDefinitionsBuilder( 1279 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1280 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1281 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1282 G_ATOMICRMW_UMIN}) 1283 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1284 {S64, GlobalPtr}, {S64, LocalPtr}, 1285 {S32, RegionPtr}, {S64, RegionPtr}}); 1286 if (ST.hasFlatAddressSpace()) { 1287 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1288 } 1289 1290 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1291 if (ST.hasLDSFPAtomicAdd()) { 1292 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1293 if (ST.hasGFX90AInsts()) 1294 Atomic.legalFor({{S64, LocalPtr}}); 1295 } 1296 if (ST.hasAtomicFaddInsts()) 1297 Atomic.legalFor({{S32, GlobalPtr}}); 1298 1299 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1300 // demarshalling 1301 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1302 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1303 {S32, FlatPtr}, {S64, FlatPtr}}) 1304 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1305 {S32, RegionPtr}, {S64, RegionPtr}}); 1306 // TODO: Pointer types, any 32-bit or 64-bit vector 1307 1308 // Condition should be s32 for scalar, s1 for vector. 1309 getActionDefinitionsBuilder(G_SELECT) 1310 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1311 LocalPtr, FlatPtr, PrivatePtr, 1312 LLT::fixed_vector(2, LocalPtr), 1313 LLT::fixed_vector(2, PrivatePtr)}, 1314 {S1, S32}) 1315 .clampScalar(0, S16, S64) 1316 .scalarize(1) 1317 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1318 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1319 .clampMaxNumElements(0, S32, 2) 1320 .clampMaxNumElements(0, LocalPtr, 2) 1321 .clampMaxNumElements(0, PrivatePtr, 2) 1322 .scalarize(0) 1323 .widenScalarToNextPow2(0) 1324 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1325 1326 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1327 // be more flexible with the shift amount type. 1328 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1329 .legalFor({{S32, S32}, {S64, S32}}); 1330 if (ST.has16BitInsts()) { 1331 if (ST.hasVOP3PInsts()) { 1332 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1333 .clampMaxNumElements(0, S16, 2); 1334 } else 1335 Shifts.legalFor({{S16, S16}}); 1336 1337 // TODO: Support 16-bit shift amounts for all types 1338 Shifts.widenScalarIf( 1339 [=](const LegalityQuery &Query) { 1340 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1341 // 32-bit amount. 1342 const LLT ValTy = Query.Types[0]; 1343 const LLT AmountTy = Query.Types[1]; 1344 return ValTy.getSizeInBits() <= 16 && 1345 AmountTy.getSizeInBits() < 16; 1346 }, changeTo(1, S16)); 1347 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1348 Shifts.clampScalar(1, S32, S32); 1349 Shifts.widenScalarToNextPow2(0, 16); 1350 Shifts.clampScalar(0, S16, S64); 1351 1352 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1353 .minScalar(0, S16) 1354 .scalarize(0) 1355 .lower(); 1356 } else { 1357 // Make sure we legalize the shift amount type first, as the general 1358 // expansion for the shifted type will produce much worse code if it hasn't 1359 // been truncated already. 
1360 Shifts.clampScalar(1, S32, S32); 1361 Shifts.widenScalarToNextPow2(0, 32); 1362 Shifts.clampScalar(0, S32, S64); 1363 1364 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1365 .minScalar(0, S32) 1366 .scalarize(0) 1367 .lower(); 1368 } 1369 Shifts.scalarize(0); 1370 1371 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1372 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1373 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1374 unsigned IdxTypeIdx = 2; 1375 1376 getActionDefinitionsBuilder(Op) 1377 .customIf([=](const LegalityQuery &Query) { 1378 const LLT EltTy = Query.Types[EltTypeIdx]; 1379 const LLT VecTy = Query.Types[VecTypeIdx]; 1380 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1381 const unsigned EltSize = EltTy.getSizeInBits(); 1382 return (EltSize == 32 || EltSize == 64) && 1383 VecTy.getSizeInBits() % 32 == 0 && 1384 VecTy.getSizeInBits() <= MaxRegisterSize && 1385 IdxTy.getSizeInBits() == 32; 1386 }) 1387 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1388 bitcastToVectorElement32(VecTypeIdx)) 1389 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1390 .bitcastIf( 1391 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1392 [=](const LegalityQuery &Query) { 1393 // For > 64-bit element types, try to turn this into a 64-bit 1394 // element vector since we may be able to do better indexing 1395 // if this is scalar. If not, fall back to 32. 1396 const LLT EltTy = Query.Types[EltTypeIdx]; 1397 const LLT VecTy = Query.Types[VecTypeIdx]; 1398 const unsigned DstEltSize = EltTy.getSizeInBits(); 1399 const unsigned VecSize = VecTy.getSizeInBits(); 1400 1401 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1402 return std::make_pair( 1403 VecTypeIdx, 1404 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1405 }) 1406 .clampScalar(EltTypeIdx, S32, S64) 1407 .clampScalar(VecTypeIdx, S32, S64) 1408 .clampScalar(IdxTypeIdx, S32, S32) 1409 .clampMaxNumElements(VecTypeIdx, S32, 32) 1410 // TODO: Clamp elements for 64-bit vectors? 1411 // It should only be necessary with variable indexes. 1412 // As a last resort, lower to the stack 1413 .lower(); 1414 } 1415 1416 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1417 .unsupportedIf([=](const LegalityQuery &Query) { 1418 const LLT &EltTy = Query.Types[1].getElementType(); 1419 return Query.Types[0] != EltTy; 1420 }); 1421 1422 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1423 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1424 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1425 1426 // FIXME: Doesn't handle extract of illegal sizes. 1427 getActionDefinitionsBuilder(Op) 1428 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1429 .lowerIf([=](const LegalityQuery &Query) { 1430 // Sub-vector(or single element) insert and extract. 1431 // TODO: verify immediate offset here since lower only works with 1432 // whole elements. 1433 const LLT BigTy = Query.Types[BigTyIdx]; 1434 return BigTy.isVector(); 1435 }) 1436 // FIXME: Multiples of 16 should not be legal. 
1437 .legalIf([=](const LegalityQuery &Query) { 1438 const LLT BigTy = Query.Types[BigTyIdx]; 1439 const LLT LitTy = Query.Types[LitTyIdx]; 1440 return (BigTy.getSizeInBits() % 32 == 0) && 1441 (LitTy.getSizeInBits() % 16 == 0); 1442 }) 1443 .widenScalarIf( 1444 [=](const LegalityQuery &Query) { 1445 const LLT BigTy = Query.Types[BigTyIdx]; 1446 return (BigTy.getScalarSizeInBits() < 16); 1447 }, 1448 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1449 .widenScalarIf( 1450 [=](const LegalityQuery &Query) { 1451 const LLT LitTy = Query.Types[LitTyIdx]; 1452 return (LitTy.getScalarSizeInBits() < 16); 1453 }, 1454 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1455 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1456 .widenScalarToNextPow2(BigTyIdx, 32); 1457 1458 } 1459 1460 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1461 .legalForCartesianProduct(AllS32Vectors, {S32}) 1462 .legalForCartesianProduct(AllS64Vectors, {S64}) 1463 .clampNumElements(0, V16S32, V32S32) 1464 .clampNumElements(0, V2S64, V16S64) 1465 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1466 1467 if (ST.hasScalarPackInsts()) { 1468 BuildVector 1469 // FIXME: Should probably widen s1 vectors straight to s32 1470 .minScalarOrElt(0, S16) 1471 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1472 .minScalar(1, S32); 1473 1474 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1475 .legalFor({V2S16, S32}) 1476 .lower(); 1477 BuildVector.minScalarOrElt(0, S32); 1478 } else { 1479 BuildVector.customFor({V2S16, S16}); 1480 BuildVector.minScalarOrElt(0, S32); 1481 1482 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1483 .customFor({V2S16, S32}) 1484 .lower(); 1485 } 1486 1487 BuildVector.legalIf(isRegisterType(0)); 1488 1489 // FIXME: Clamp maximum size 1490 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1491 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1492 .clampMaxNumElements(0, S32, 32) 1493 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1494 .clampMaxNumElements(0, S16, 64); 1495 1496 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1497 // pre-legalize. 1498 if (ST.hasVOP3PInsts()) { 1499 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1500 .customFor({V2S16, V2S16}) 1501 .lower(); 1502 } else 1503 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1504 1505 // Merge/Unmerge 1506 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1507 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1508 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1509 1510 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1511 const LLT Ty = Query.Types[TypeIdx]; 1512 if (Ty.isVector()) { 1513 const LLT &EltTy = Ty.getElementType(); 1514 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1515 return true; 1516 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1517 return true; 1518 } 1519 return false; 1520 }; 1521 1522 auto &Builder = getActionDefinitionsBuilder(Op) 1523 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1524 .lowerFor({{S16, V2S16}}) 1525 .lowerIf([=](const LegalityQuery &Query) { 1526 const LLT BigTy = Query.Types[BigTyIdx]; 1527 return BigTy.getSizeInBits() == 32; 1528 }) 1529 // Try to widen to s16 first for small types. 
1530 // TODO: Only do this on targets with legal s16 shifts 1531 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1532 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1533 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1534 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1535 elementTypeIs(1, S16)), 1536 changeTo(1, V2S16)) 1537 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1538 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1539 // valid. 1540 .clampScalar(LitTyIdx, S32, S512) 1541 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1542 // Break up vectors with weird elements into scalars 1543 .fewerElementsIf( 1544 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1545 scalarize(0)) 1546 .fewerElementsIf( 1547 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1548 scalarize(1)) 1549 .clampScalar(BigTyIdx, S32, MaxScalar); 1550 1551 if (Op == G_MERGE_VALUES) { 1552 Builder.widenScalarIf( 1553 // TODO: Use 16-bit shifts if legal for 8-bit values? 1554 [=](const LegalityQuery &Query) { 1555 const LLT Ty = Query.Types[LitTyIdx]; 1556 return Ty.getSizeInBits() < 32; 1557 }, 1558 changeTo(LitTyIdx, S32)); 1559 } 1560 1561 Builder.widenScalarIf( 1562 [=](const LegalityQuery &Query) { 1563 const LLT Ty = Query.Types[BigTyIdx]; 1564 return !isPowerOf2_32(Ty.getSizeInBits()) && 1565 Ty.getSizeInBits() % 16 != 0; 1566 }, 1567 [=](const LegalityQuery &Query) { 1568 // Pick the next power of 2, or a multiple of 64 over 128. 1569 // Whichever is smaller. 1570 const LLT &Ty = Query.Types[BigTyIdx]; 1571 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1572 if (NewSizeInBits >= 256) { 1573 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1574 if (RoundedTo < NewSizeInBits) 1575 NewSizeInBits = RoundedTo; 1576 } 1577 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1578 }) 1579 // Any vectors left are the wrong size. Scalarize them. 1580 .scalarize(0) 1581 .scalarize(1); 1582 } 1583 1584 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1585 // RegBankSelect. 1586 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1587 .legalFor({{S32}, {S64}}); 1588 1589 if (ST.hasVOP3PInsts()) { 1590 SextInReg.lowerFor({{V2S16}}) 1591 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1592 // get more vector shift opportunities, since we'll get those when 1593 // expanded. 1594 .clampMaxNumElementsStrict(0, S16, 2); 1595 } else if (ST.has16BitInsts()) { 1596 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1597 } else { 1598 // Prefer to promote to s32 before lowering if we don't have 16-bit 1599 // shifts. This avoid a lot of intermediate truncate and extend operations. 1600 SextInReg.lowerFor({{S32}, {S64}}); 1601 } 1602 1603 SextInReg 1604 .scalarize(0) 1605 .clampScalar(0, S32, S64) 1606 .lower(); 1607 1608 getActionDefinitionsBuilder({G_ROTR, G_ROTL}) 1609 .scalarize(0) 1610 .lower(); 1611 1612 // TODO: Only Try to form v2s16 with legal packed instructions. 
1613 getActionDefinitionsBuilder(G_FSHR) 1614 .legalFor({{S32, S32}}) 1615 .lowerFor({{V2S16, V2S16}}) 1616 .clampMaxNumElementsStrict(0, S16, 2) 1617 .scalarize(0) 1618 .lower(); 1619 1620 if (ST.hasVOP3PInsts()) { 1621 getActionDefinitionsBuilder(G_FSHL) 1622 .lowerFor({{V2S16, V2S16}}) 1623 .clampMaxNumElementsStrict(0, S16, 2) 1624 .scalarize(0) 1625 .lower(); 1626 } else { 1627 getActionDefinitionsBuilder(G_FSHL) 1628 .scalarize(0) 1629 .lower(); 1630 } 1631 1632 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1633 .legalFor({S64}); 1634 1635 getActionDefinitionsBuilder(G_FENCE) 1636 .alwaysLegal(); 1637 1638 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1639 .scalarize(0) 1640 .minScalar(0, S32) 1641 .lower(); 1642 1643 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1644 .legalFor({{S32, S32}, {S64, S32}}) 1645 .clampScalar(1, S32, S32) 1646 .clampScalar(0, S32, S64) 1647 .widenScalarToNextPow2(0) 1648 .scalarize(0); 1649 1650 getActionDefinitionsBuilder({ 1651 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1652 G_FCOPYSIGN, 1653 1654 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1655 G_ATOMICRMW_NAND, 1656 G_ATOMICRMW_FSUB, 1657 G_READ_REGISTER, 1658 G_WRITE_REGISTER, 1659 1660 G_SADDO, G_SSUBO, 1661 1662 // TODO: Implement 1663 G_FMINIMUM, G_FMAXIMUM}).lower(); 1664 1665 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1666 .lower(); 1667 1668 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1669 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1670 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1671 .unsupported(); 1672 1673 getLegacyLegalizerInfo().computeTables(); 1674 verify(*ST.getInstrInfo()); 1675 } 1676 1677 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1678 MachineInstr &MI) const { 1679 MachineIRBuilder &B = Helper.MIRBuilder; 1680 MachineRegisterInfo &MRI = *B.getMRI(); 1681 1682 switch (MI.getOpcode()) { 1683 case TargetOpcode::G_ADDRSPACE_CAST: 1684 return legalizeAddrSpaceCast(MI, MRI, B); 1685 case TargetOpcode::G_FRINT: 1686 return legalizeFrint(MI, MRI, B); 1687 case TargetOpcode::G_FCEIL: 1688 return legalizeFceil(MI, MRI, B); 1689 case TargetOpcode::G_FREM: 1690 return legalizeFrem(MI, MRI, B); 1691 case TargetOpcode::G_INTRINSIC_TRUNC: 1692 return legalizeIntrinsicTrunc(MI, MRI, B); 1693 case TargetOpcode::G_SITOFP: 1694 return legalizeITOFP(MI, MRI, B, true); 1695 case TargetOpcode::G_UITOFP: 1696 return legalizeITOFP(MI, MRI, B, false); 1697 case TargetOpcode::G_FPTOSI: 1698 return legalizeFPTOI(MI, MRI, B, true); 1699 case TargetOpcode::G_FPTOUI: 1700 return legalizeFPTOI(MI, MRI, B, false); 1701 case TargetOpcode::G_FMINNUM: 1702 case TargetOpcode::G_FMAXNUM: 1703 case TargetOpcode::G_FMINNUM_IEEE: 1704 case TargetOpcode::G_FMAXNUM_IEEE: 1705 return legalizeMinNumMaxNum(Helper, MI); 1706 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1707 return legalizeExtractVectorElt(MI, MRI, B); 1708 case TargetOpcode::G_INSERT_VECTOR_ELT: 1709 return legalizeInsertVectorElt(MI, MRI, B); 1710 case TargetOpcode::G_SHUFFLE_VECTOR: 1711 return legalizeShuffleVector(MI, MRI, B); 1712 case TargetOpcode::G_FSIN: 1713 case TargetOpcode::G_FCOS: 1714 return legalizeSinCos(MI, MRI, B); 1715 case TargetOpcode::G_GLOBAL_VALUE: 1716 return legalizeGlobalValue(MI, MRI, B); 1717 case TargetOpcode::G_LOAD: 1718 case TargetOpcode::G_SEXTLOAD: 1719 case TargetOpcode::G_ZEXTLOAD: 1720 return legalizeLoad(Helper, MI); 1721 case TargetOpcode::G_FMAD: 1722 return legalizeFMad(MI, MRI, B); 1723 case TargetOpcode::G_FDIV: 1724 return legalizeFDIV(MI, 
MRI, B); 1725 case TargetOpcode::G_UDIV: 1726 case TargetOpcode::G_UREM: 1727 case TargetOpcode::G_UDIVREM: 1728 return legalizeUnsignedDIV_REM(MI, MRI, B); 1729 case TargetOpcode::G_SDIV: 1730 case TargetOpcode::G_SREM: 1731 case TargetOpcode::G_SDIVREM: 1732 return legalizeSignedDIV_REM(MI, MRI, B); 1733 case TargetOpcode::G_ATOMIC_CMPXCHG: 1734 return legalizeAtomicCmpXChg(MI, MRI, B); 1735 case TargetOpcode::G_FLOG: 1736 return legalizeFlog(MI, B, numbers::ln2f); 1737 case TargetOpcode::G_FLOG10: 1738 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1739 case TargetOpcode::G_FEXP: 1740 return legalizeFExp(MI, B); 1741 case TargetOpcode::G_FPOW: 1742 return legalizeFPow(MI, B); 1743 case TargetOpcode::G_FFLOOR: 1744 return legalizeFFloor(MI, MRI, B); 1745 case TargetOpcode::G_BUILD_VECTOR: 1746 return legalizeBuildVector(MI, MRI, B); 1747 case TargetOpcode::G_CTLZ: 1748 case TargetOpcode::G_CTTZ: 1749 return legalizeCTLZ_CTTZ(MI, MRI, B); 1750 default: 1751 return false; 1752 } 1753 1754 llvm_unreachable("expected switch to return"); 1755 } 1756 1757 Register AMDGPULegalizerInfo::getSegmentAperture( 1758 unsigned AS, 1759 MachineRegisterInfo &MRI, 1760 MachineIRBuilder &B) const { 1761 MachineFunction &MF = B.getMF(); 1762 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1763 const LLT S32 = LLT::scalar(32); 1764 1765 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1766 1767 if (ST.hasApertureRegs()) { 1768 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1769 // getreg. 1770 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1771 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1772 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1773 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1774 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1775 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1776 unsigned Encoding = 1777 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1778 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1779 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1780 1781 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1782 1783 B.buildInstr(AMDGPU::S_GETREG_B32) 1784 .addDef(GetReg) 1785 .addImm(Encoding); 1786 MRI.setType(GetReg, S32); 1787 1788 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1789 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1790 } 1791 1792 Register QueuePtr = MRI.createGenericVirtualRegister( 1793 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1794 1795 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1796 return Register(); 1797 1798 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1799 // private_segment_aperture_base_hi. 1800 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1801 1802 // TODO: can we be smarter about machine pointer info? 1803 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1804 MachineMemOperand *MMO = MF.getMachineMemOperand( 1805 PtrInfo, 1806 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1807 MachineMemOperand::MOInvariant, 1808 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1809 1810 Register LoadAddr; 1811 1812 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1813 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1814 } 1815 1816 /// Return true if the value is a known valid address, such that a null check is 1817 /// not necessary. 
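/// Frame indexes, global values and block addresses can never be null, and a
/// G_CONSTANT is known non-null as long as its value differs from the address
/// space's null value.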
1818 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1819 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1820 MachineInstr *Def = MRI.getVRegDef(Val); 1821 switch (Def->getOpcode()) { 1822 case AMDGPU::G_FRAME_INDEX: 1823 case AMDGPU::G_GLOBAL_VALUE: 1824 case AMDGPU::G_BLOCK_ADDR: 1825 return true; 1826 case AMDGPU::G_CONSTANT: { 1827 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1828 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1829 } 1830 default: 1831 return false; 1832 } 1833 1834 return false; 1835 } 1836 1837 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1838 MachineInstr &MI, MachineRegisterInfo &MRI, 1839 MachineIRBuilder &B) const { 1840 MachineFunction &MF = B.getMF(); 1841 1842 const LLT S32 = LLT::scalar(32); 1843 Register Dst = MI.getOperand(0).getReg(); 1844 Register Src = MI.getOperand(1).getReg(); 1845 1846 LLT DstTy = MRI.getType(Dst); 1847 LLT SrcTy = MRI.getType(Src); 1848 unsigned DestAS = DstTy.getAddressSpace(); 1849 unsigned SrcAS = SrcTy.getAddressSpace(); 1850 1851 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1852 // vector element. 1853 assert(!DstTy.isVector()); 1854 1855 const AMDGPUTargetMachine &TM 1856 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1857 1858 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1859 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1860 return true; 1861 } 1862 1863 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1864 // Truncate. 1865 B.buildExtract(Dst, Src, 0); 1866 MI.eraseFromParent(); 1867 return true; 1868 } 1869 1870 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1871 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1872 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1873 1874 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1875 // another. Merge operands are required to be the same type, but creating an 1876 // extra ptrtoint would be kind of pointless. 1877 auto HighAddr = B.buildConstant( 1878 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1879 B.buildMerge(Dst, {Src, HighAddr}); 1880 MI.eraseFromParent(); 1881 return true; 1882 } 1883 1884 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1885 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1886 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1887 1888 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1889 // Extract low 32-bits of the pointer. 1890 B.buildExtract(Dst, Src, 0); 1891 MI.eraseFromParent(); 1892 return true; 1893 } 1894 1895 unsigned NullVal = TM.getNullPointerValue(DestAS); 1896 1897 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1898 auto FlatNull = B.buildConstant(SrcTy, 0); 1899 1900 // Extract low 32-bits of the pointer. 1901 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1902 1903 auto CmpRes = 1904 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1905 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1906 1907 MI.eraseFromParent(); 1908 return true; 1909 } 1910 1911 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1912 return false; 1913 1914 if (!ST.hasFlatAddressSpace()) 1915 return false; 1916 1917 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1918 if (!ApertureReg.isValid()) 1919 return false; 1920 1921 // Coerce the type of the low half of the result so we can use merge_values. 
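  // For example, a local (p3) to flat (p0) cast is built roughly as:
  //   %lo:_(s32) = G_PTRTOINT %src(p3)
  //   %hi:_(s32) = <aperture base, from s_getreg or a load off the queue ptr>
  //   %flat:_(p0) = merge of %lo (low half) and %hi (high half)
  // followed below by a null check unless the source is known non-null.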
1922 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1923 1924 // TODO: Should we allow mismatched types but matching sizes in merges to 1925 // avoid the ptrtoint? 1926 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1927 1928 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1929 B.buildCopy(Dst, BuildPtr); 1930 MI.eraseFromParent(); 1931 return true; 1932 } 1933 1934 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1935 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1936 1937 auto CmpRes = 1938 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1939 1940 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1941 1942 MI.eraseFromParent(); 1943 return true; 1944 } 1945 1946 bool AMDGPULegalizerInfo::legalizeFrint( 1947 MachineInstr &MI, MachineRegisterInfo &MRI, 1948 MachineIRBuilder &B) const { 1949 Register Src = MI.getOperand(1).getReg(); 1950 LLT Ty = MRI.getType(Src); 1951 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1952 1953 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1954 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1955 1956 auto C1 = B.buildFConstant(Ty, C1Val); 1957 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1958 1959 // TODO: Should this propagate fast-math-flags? 1960 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1961 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1962 1963 auto C2 = B.buildFConstant(Ty, C2Val); 1964 auto Fabs = B.buildFAbs(Ty, Src); 1965 1966 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1967 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1968 MI.eraseFromParent(); 1969 return true; 1970 } 1971 1972 bool AMDGPULegalizerInfo::legalizeFceil( 1973 MachineInstr &MI, MachineRegisterInfo &MRI, 1974 MachineIRBuilder &B) const { 1975 1976 const LLT S1 = LLT::scalar(1); 1977 const LLT S64 = LLT::scalar(64); 1978 1979 Register Src = MI.getOperand(1).getReg(); 1980 assert(MRI.getType(Src) == S64); 1981 1982 // result = trunc(src) 1983 // if (src > 0.0 && src != result) 1984 // result += 1.0 1985 1986 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1987 1988 const auto Zero = B.buildFConstant(S64, 0.0); 1989 const auto One = B.buildFConstant(S64, 1.0); 1990 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1991 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1992 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1993 auto Add = B.buildSelect(S64, And, One, Zero); 1994 1995 // TODO: Should this propagate fast-math-flags? 
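  // Worked example: ceil(2.3) -> Trunc = 2.0; 2.3 > 0.0 and 2.3 != 2.0, so the
  // select yields 1.0 and the sum is 3.0. ceil(-2.3) -> Trunc = -2.0; the
  // "> 0.0" test fails, the select yields 0.0, and the result stays -2.0.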
1996 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1997 MI.eraseFromParent(); 1998 return true; 1999 } 2000 2001 bool AMDGPULegalizerInfo::legalizeFrem( 2002 MachineInstr &MI, MachineRegisterInfo &MRI, 2003 MachineIRBuilder &B) const { 2004 Register DstReg = MI.getOperand(0).getReg(); 2005 Register Src0Reg = MI.getOperand(1).getReg(); 2006 Register Src1Reg = MI.getOperand(2).getReg(); 2007 auto Flags = MI.getFlags(); 2008 LLT Ty = MRI.getType(DstReg); 2009 2010 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2011 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2012 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2013 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2014 MI.eraseFromParent(); 2015 return true; 2016 } 2017 2018 static MachineInstrBuilder extractF64Exponent(Register Hi, 2019 MachineIRBuilder &B) { 2020 const unsigned FractBits = 52; 2021 const unsigned ExpBits = 11; 2022 LLT S32 = LLT::scalar(32); 2023 2024 auto Const0 = B.buildConstant(S32, FractBits - 32); 2025 auto Const1 = B.buildConstant(S32, ExpBits); 2026 2027 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2028 .addUse(Hi) 2029 .addUse(Const0.getReg(0)) 2030 .addUse(Const1.getReg(0)); 2031 2032 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2033 } 2034 2035 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2036 MachineInstr &MI, MachineRegisterInfo &MRI, 2037 MachineIRBuilder &B) const { 2038 const LLT S1 = LLT::scalar(1); 2039 const LLT S32 = LLT::scalar(32); 2040 const LLT S64 = LLT::scalar(64); 2041 2042 Register Src = MI.getOperand(1).getReg(); 2043 assert(MRI.getType(Src) == S64); 2044 2045 // TODO: Should this use extract since the low half is unused? 2046 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2047 Register Hi = Unmerge.getReg(1); 2048 2049 // Extract the upper half, since this is where we will find the sign and 2050 // exponent. 2051 auto Exp = extractF64Exponent(Hi, B); 2052 2053 const unsigned FractBits = 52; 2054 2055 // Extract the sign bit. 2056 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2057 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2058 2059 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2060 2061 const auto Zero32 = B.buildConstant(S32, 0); 2062 2063 // Extend back to 64-bits. 2064 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2065 2066 auto Shr = B.buildAShr(S64, FractMask, Exp); 2067 auto Not = B.buildNot(S64, Shr); 2068 auto Tmp0 = B.buildAnd(S64, Src, Not); 2069 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2070 2071 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2072 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2073 2074 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2075 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2076 MI.eraseFromParent(); 2077 return true; 2078 } 2079 2080 bool AMDGPULegalizerInfo::legalizeITOFP( 2081 MachineInstr &MI, MachineRegisterInfo &MRI, 2082 MachineIRBuilder &B, bool Signed) const { 2083 2084 Register Dst = MI.getOperand(0).getReg(); 2085 Register Src = MI.getOperand(1).getReg(); 2086 2087 const LLT S64 = LLT::scalar(64); 2088 const LLT S32 = LLT::scalar(32); 2089 2090 assert(MRI.getType(Src) == S64); 2091 2092 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2093 auto ThirtyTwo = B.buildConstant(S32, 32); 2094 2095 if (MRI.getType(Dst) == S64) { 2096 auto CvtHi = Signed ? 
B.buildSITOFP(S64, Unmerge.getReg(1)) 2097 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2098 2099 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2100 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2101 .addUse(CvtHi.getReg(0)) 2102 .addUse(ThirtyTwo.getReg(0)); 2103 2104 // TODO: Should this propagate fast-math-flags? 2105 B.buildFAdd(Dst, LdExp, CvtLo); 2106 MI.eraseFromParent(); 2107 return true; 2108 } 2109 2110 assert(MRI.getType(Dst) == S32); 2111 2112 auto One = B.buildConstant(S32, 1); 2113 2114 MachineInstrBuilder ShAmt; 2115 if (Signed) { 2116 auto ThirtyOne = B.buildConstant(S32, 31); 2117 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2118 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2119 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2120 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2121 /*HasSideEffects=*/false) 2122 .addUse(Unmerge.getReg(1)); 2123 auto LS2 = B.buildSub(S32, LS, One); 2124 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2125 } else 2126 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2127 auto Norm = B.buildShl(S64, Src, ShAmt); 2128 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2129 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2130 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2131 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2132 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2133 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2134 /*HasSideEffects=*/false) 2135 .addUse(FVal.getReg(0)) 2136 .addUse(Scale.getReg(0)); 2137 MI.eraseFromParent(); 2138 return true; 2139 } 2140 2141 // TODO: Copied from DAG implementation. Verify logic and document how this 2142 // actually works. 2143 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2144 MachineRegisterInfo &MRI, 2145 MachineIRBuilder &B, 2146 bool Signed) const { 2147 2148 Register Dst = MI.getOperand(0).getReg(); 2149 Register Src = MI.getOperand(1).getReg(); 2150 2151 const LLT S64 = LLT::scalar(64); 2152 const LLT S32 = LLT::scalar(32); 2153 2154 const LLT SrcLT = MRI.getType(Src); 2155 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2156 2157 unsigned Flags = MI.getFlags(); 2158 2159 // The basic idea of converting a floating point number into a pair of 32-bit 2160 // integers is illustrated as follows: 2161 // 2162 // tf := trunc(val); 2163 // hif := floor(tf * 2^-32); 2164 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2165 // hi := fptoi(hif); 2166 // lo := fptoi(lof); 2167 // 2168 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2169 MachineInstrBuilder Sign; 2170 if (Signed && SrcLT == S32) { 2171 // However, a 32-bit floating point number has only 23 bits mantissa and 2172 // it's not enough to hold all the significant bits of `lof` if val is 2173 // negative. To avoid the loss of precision, We need to take the absolute 2174 // value after truncating and flip the result back based on the original 2175 // signedness. 
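  // Illustration for a signed f32 source: for val = -5.75, Trunc = -5.0 and
  // the arithmetic shift of the sign bit gives Sign = 0xffffffff. Using
  // |Trunc| = 5.0, the split below produces {lo, hi} = {5, 0}; the final
  // xor-and-subtract against {Sign, Sign} restores the two's complement
  // result -5.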
2176 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2177 Trunc = B.buildFAbs(S32, Trunc, Flags); 2178 } 2179 MachineInstrBuilder K0, K1; 2180 if (SrcLT == S64) { 2181 K0 = B.buildFConstant(S64, 2182 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2183 K1 = B.buildFConstant(S64, 2184 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2185 } else { 2186 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2187 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2188 } 2189 2190 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2191 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2192 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2193 2194 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2195 : B.buildFPTOUI(S32, FloorMul); 2196 auto Lo = B.buildFPTOUI(S32, Fma); 2197 2198 if (Signed && SrcLT == S32) { 2199 // Flip the result based on the signedness, which is either all 0s or 1s. 2200 Sign = B.buildMerge(S64, {Sign, Sign}); 2201 // r := xor({lo, hi}, sign) - sign; 2202 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2203 } else 2204 B.buildMerge(Dst, {Lo, Hi}); 2205 MI.eraseFromParent(); 2206 2207 return true; 2208 } 2209 2210 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2211 MachineInstr &MI) const { 2212 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2213 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2214 2215 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2216 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2217 2218 // With ieee_mode disabled, the instructions have the correct behavior 2219 // already for G_FMINNUM/G_FMAXNUM 2220 if (!MFI->getMode().IEEE) 2221 return !IsIEEEOp; 2222 2223 if (IsIEEEOp) 2224 return true; 2225 2226 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2227 } 2228 2229 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2230 MachineInstr &MI, MachineRegisterInfo &MRI, 2231 MachineIRBuilder &B) const { 2232 // TODO: Should move some of this into LegalizerHelper. 2233 2234 // TODO: Promote dynamic indexing of s16 to s32 2235 2236 // FIXME: Artifact combiner probably should have replaced the truncated 2237 // constant before this, so we shouldn't need 2238 // getIConstantVRegValWithLookThrough. 2239 Optional<ValueAndVReg> MaybeIdxVal = 2240 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2241 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2242 return true; 2243 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2244 2245 Register Dst = MI.getOperand(0).getReg(); 2246 Register Vec = MI.getOperand(1).getReg(); 2247 2248 LLT VecTy = MRI.getType(Vec); 2249 LLT EltTy = VecTy.getElementType(); 2250 assert(EltTy == MRI.getType(Dst)); 2251 2252 if (IdxVal < VecTy.getNumElements()) { 2253 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2254 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2255 } else { 2256 B.buildUndef(Dst); 2257 } 2258 2259 MI.eraseFromParent(); 2260 return true; 2261 } 2262 2263 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2264 MachineInstr &MI, MachineRegisterInfo &MRI, 2265 MachineIRBuilder &B) const { 2266 // TODO: Should move some of this into LegalizerHelper. 2267 2268 // TODO: Promote dynamic indexing of s16 to s32 2269 2270 // FIXME: Artifact combiner probably should have replaced the truncated 2271 // constant before this, so we shouldn't need 2272 // getIConstantVRegValWithLookThrough. 
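  // With a constant index the insert is expanded by value: e.g. for element 1
  // of a <4 x s16>, unmerge the vector into four s16 scalars, substitute the
  // second scalar with the value being inserted, and rebuild the vector from
  // the pieces. An out-of-bounds constant index just produces an undef result.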
2273 Optional<ValueAndVReg> MaybeIdxVal = 2274 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2275 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2276 return true; 2277 2278 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2279 Register Dst = MI.getOperand(0).getReg(); 2280 Register Vec = MI.getOperand(1).getReg(); 2281 Register Ins = MI.getOperand(2).getReg(); 2282 2283 LLT VecTy = MRI.getType(Vec); 2284 LLT EltTy = VecTy.getElementType(); 2285 assert(EltTy == MRI.getType(Ins)); 2286 (void)Ins; 2287 2288 unsigned NumElts = VecTy.getNumElements(); 2289 if (IdxVal < NumElts) { 2290 SmallVector<Register, 8> SrcRegs; 2291 for (unsigned i = 0; i < NumElts; ++i) 2292 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2293 B.buildUnmerge(SrcRegs, Vec); 2294 2295 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2296 B.buildMerge(Dst, SrcRegs); 2297 } else { 2298 B.buildUndef(Dst); 2299 } 2300 2301 MI.eraseFromParent(); 2302 return true; 2303 } 2304 2305 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2306 MachineInstr &MI, MachineRegisterInfo &MRI, 2307 MachineIRBuilder &B) const { 2308 const LLT V2S16 = LLT::fixed_vector(2, 16); 2309 2310 Register Dst = MI.getOperand(0).getReg(); 2311 Register Src0 = MI.getOperand(1).getReg(); 2312 LLT DstTy = MRI.getType(Dst); 2313 LLT SrcTy = MRI.getType(Src0); 2314 2315 if (SrcTy == V2S16 && DstTy == V2S16 && 2316 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2317 return true; 2318 2319 MachineIRBuilder HelperBuilder(MI); 2320 GISelObserverWrapper DummyObserver; 2321 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2322 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2323 } 2324 2325 bool AMDGPULegalizerInfo::legalizeSinCos( 2326 MachineInstr &MI, MachineRegisterInfo &MRI, 2327 MachineIRBuilder &B) const { 2328 2329 Register DstReg = MI.getOperand(0).getReg(); 2330 Register SrcReg = MI.getOperand(1).getReg(); 2331 LLT Ty = MRI.getType(DstReg); 2332 unsigned Flags = MI.getFlags(); 2333 2334 Register TrigVal; 2335 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2336 if (ST.hasTrigReducedRange()) { 2337 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2338 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2339 .addUse(MulVal.getReg(0)) 2340 .setMIFlags(Flags).getReg(0); 2341 } else 2342 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2343 2344 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2345 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2346 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2347 .addUse(TrigVal) 2348 .setMIFlags(Flags); 2349 MI.eraseFromParent(); 2350 return true; 2351 } 2352 2353 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2354 MachineIRBuilder &B, 2355 const GlobalValue *GV, 2356 int64_t Offset, 2357 unsigned GAFlags) const { 2358 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2359 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2360 // to the following code sequence: 2361 // 2362 // For constant address space: 2363 // s_getpc_b64 s[0:1] 2364 // s_add_u32 s0, s0, $symbol 2365 // s_addc_u32 s1, s1, 0 2366 // 2367 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2368 // a fixup or relocation is emitted to replace $symbol with a literal 2369 // constant, which is a pc-relative offset from the encoding of the $symbol 2370 // operand to the global variable. 2371 // 2372 // For global address space: 2373 // s_getpc_b64 s[0:1] 2374 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2375 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2376 // 2377 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2378 // fixups or relocations are emitted to replace $symbol@*@lo and 2379 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2380 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2381 // operand to the global variable. 2382 // 2383 // What we want here is an offset from the value returned by s_getpc 2384 // (which is the address of the s_add_u32 instruction) to the global 2385 // variable, but since the encoding of $symbol starts 4 bytes after the start 2386 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2387 // small. This requires us to add 4 to the global variable offset in order to 2388 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2389 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2390 // instruction. 2391 2392 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2393 2394 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2395 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2396 2397 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2398 .addDef(PCReg); 2399 2400 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2401 if (GAFlags == SIInstrInfo::MO_NONE) 2402 MIB.addImm(0); 2403 else 2404 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2405 2406 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2407 2408 if (PtrTy.getSizeInBits() == 32) 2409 B.buildExtract(DstReg, PCReg, 0); 2410 return true; 2411 } 2412 2413 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2414 MachineInstr &MI, MachineRegisterInfo &MRI, 2415 MachineIRBuilder &B) const { 2416 Register DstReg = MI.getOperand(0).getReg(); 2417 LLT Ty = MRI.getType(DstReg); 2418 unsigned AS = Ty.getAddressSpace(); 2419 2420 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2421 MachineFunction &MF = B.getMF(); 2422 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2423 2424 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2425 if (!MFI->isModuleEntryFunction() && 2426 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2427 const Function &Fn = MF.getFunction(); 2428 DiagnosticInfoUnsupported BadLDSDecl( 2429 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2430 DS_Warning); 2431 Fn.getContext().diagnose(BadLDSDecl); 2432 2433 // We currently don't have a way to correctly allocate LDS objects that 2434 // aren't directly associated with a kernel. We do force inlining of 2435 // functions that use local objects. However, if these dead functions are 2436 // not eliminated, we don't want a compile time error. Just emit a warning 2437 // and a trap, since there should be no callable path here. 2438 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2439 B.buildUndef(DstReg); 2440 MI.eraseFromParent(); 2441 return true; 2442 } 2443 2444 // TODO: We could emit code to handle the initialization somewhere. 2445 // We ignore the initializer for now and legalize it to allow selection. 2446 // The initializer will anyway get errored out during assembly emission. 2447 const SITargetLowering *TLI = ST.getTargetLowering(); 2448 if (!TLI->shouldUseLDSConstAddress(GV)) { 2449 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2450 return true; // Leave in place; 2451 } 2452 2453 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2454 Type *Ty = GV->getValueType(); 2455 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2456 // zero-sized type in other languages to declare the dynamic shared 2457 // memory which size is not known at the compile time. They will be 2458 // allocated by the runtime and placed directly after the static 2459 // allocated ones. They all share the same offset. 2460 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2461 // Adjust alignment for that dynamic shared memory array. 
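      // For such a zero-sized external array (e.g. a HIP
      // `extern __shared__ float s[];` declaration) the address is simply the
      // kernel's total static LDS size: the dynamic portion starts where the
      // statically allocated LDS ends, which is what
      // llvm.amdgcn.groupstaticsize returns.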
2462 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2463 LLT S32 = LLT::scalar(32); 2464 auto Sz = 2465 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2466 B.buildIntToPtr(DstReg, Sz); 2467 MI.eraseFromParent(); 2468 return true; 2469 } 2470 } 2471 2472 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2473 *cast<GlobalVariable>(GV))); 2474 MI.eraseFromParent(); 2475 return true; 2476 } 2477 2478 const SITargetLowering *TLI = ST.getTargetLowering(); 2479 2480 if (TLI->shouldEmitFixup(GV)) { 2481 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2482 MI.eraseFromParent(); 2483 return true; 2484 } 2485 2486 if (TLI->shouldEmitPCReloc(GV)) { 2487 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2488 MI.eraseFromParent(); 2489 return true; 2490 } 2491 2492 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2493 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2494 2495 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2496 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2497 MachinePointerInfo::getGOT(MF), 2498 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2499 MachineMemOperand::MOInvariant, 2500 LoadTy, Align(8)); 2501 2502 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2503 2504 if (Ty.getSizeInBits() == 32) { 2505 // Truncate if this is a 32-bit constant address. 2506 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2507 B.buildExtract(DstReg, Load, 0); 2508 } else 2509 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2510 2511 MI.eraseFromParent(); 2512 return true; 2513 } 2514 2515 static LLT widenToNextPowerOf2(LLT Ty) { 2516 if (Ty.isVector()) 2517 return Ty.changeElementCount( 2518 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2519 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2520 } 2521 2522 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2523 MachineInstr &MI) const { 2524 MachineIRBuilder &B = Helper.MIRBuilder; 2525 MachineRegisterInfo &MRI = *B.getMRI(); 2526 GISelChangeObserver &Observer = Helper.Observer; 2527 2528 Register PtrReg = MI.getOperand(1).getReg(); 2529 LLT PtrTy = MRI.getType(PtrReg); 2530 unsigned AddrSpace = PtrTy.getAddressSpace(); 2531 2532 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2533 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2534 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2535 Observer.changingInstr(MI); 2536 MI.getOperand(1).setReg(Cast.getReg(0)); 2537 Observer.changedInstr(MI); 2538 return true; 2539 } 2540 2541 if (MI.getOpcode() != AMDGPU::G_LOAD) 2542 return false; 2543 2544 Register ValReg = MI.getOperand(0).getReg(); 2545 LLT ValTy = MRI.getType(ValReg); 2546 2547 MachineMemOperand *MMO = *MI.memoperands_begin(); 2548 const unsigned ValSize = ValTy.getSizeInBits(); 2549 const LLT MemTy = MMO->getMemoryType(); 2550 const Align MemAlign = MMO->getAlign(); 2551 const unsigned MemSize = MemTy.getSizeInBits(); 2552 const uint64_t AlignInBits = 8 * MemAlign.value(); 2553 2554 // Widen non-power-of-2 loads to the alignment if needed 2555 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2556 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2557 2558 // This was already the correct extending load result type, so just adjust 2559 // the memory type. 
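    // For instance, an extending load of s24 memory into an s32 result:
    // PowerOf2Ceil(24) is 32, which already matches the result size, so only
    // the MachineMemOperand needs to be widened to cover 4 bytes.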
2560 if (WideMemSize == ValSize) { 2561 MachineFunction &MF = B.getMF(); 2562 2563 MachineMemOperand *WideMMO = 2564 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2565 Observer.changingInstr(MI); 2566 MI.setMemRefs(MF, {WideMMO}); 2567 Observer.changedInstr(MI); 2568 return true; 2569 } 2570 2571 // Don't bother handling edge case that should probably never be produced. 2572 if (ValSize > WideMemSize) 2573 return false; 2574 2575 LLT WideTy = widenToNextPowerOf2(ValTy); 2576 2577 Register WideLoad; 2578 if (!WideTy.isVector()) { 2579 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2580 B.buildTrunc(ValReg, WideLoad).getReg(0); 2581 } else { 2582 // Extract the subvector. 2583 2584 if (isRegisterType(ValTy)) { 2585 // If this a case where G_EXTRACT is legal, use it. 2586 // (e.g. <3 x s32> -> <4 x s32>) 2587 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2588 B.buildExtract(ValReg, WideLoad, 0); 2589 } else { 2590 // For cases where the widened type isn't a nice register value, unmerge 2591 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2592 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2593 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2594 } 2595 } 2596 2597 MI.eraseFromParent(); 2598 return true; 2599 } 2600 2601 return false; 2602 } 2603 2604 bool AMDGPULegalizerInfo::legalizeFMad( 2605 MachineInstr &MI, MachineRegisterInfo &MRI, 2606 MachineIRBuilder &B) const { 2607 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2608 assert(Ty.isScalar()); 2609 2610 MachineFunction &MF = B.getMF(); 2611 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2612 2613 // TODO: Always legal with future ftz flag. 2614 // FIXME: Do we need just output? 2615 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2616 return true; 2617 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2618 return true; 2619 2620 MachineIRBuilder HelperBuilder(MI); 2621 GISelObserverWrapper DummyObserver; 2622 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2623 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2624 } 2625 2626 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2627 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2628 Register DstReg = MI.getOperand(0).getReg(); 2629 Register PtrReg = MI.getOperand(1).getReg(); 2630 Register CmpVal = MI.getOperand(2).getReg(); 2631 Register NewVal = MI.getOperand(3).getReg(); 2632 2633 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2634 "this should not have been custom lowered"); 2635 2636 LLT ValTy = MRI.getType(CmpVal); 2637 LLT VecTy = LLT::fixed_vector(2, ValTy); 2638 2639 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2640 2641 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2642 .addDef(DstReg) 2643 .addUse(PtrReg) 2644 .addUse(PackedVal) 2645 .setMemRefs(MI.memoperands()); 2646 2647 MI.eraseFromParent(); 2648 return true; 2649 } 2650 2651 bool AMDGPULegalizerInfo::legalizeFlog( 2652 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2653 Register Dst = MI.getOperand(0).getReg(); 2654 Register Src = MI.getOperand(1).getReg(); 2655 LLT Ty = B.getMRI()->getType(Dst); 2656 unsigned Flags = MI.getFlags(); 2657 2658 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2659 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2660 2661 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2662 
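  // i.e. log_b(x) = log2(x) * (1 / log2(b)); the callers pass ln(2) for G_FLOG
  // and ln(2) / ln(10) (= log10(2)) for G_FLOG10 as Log2BaseInverted.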
MI.eraseFromParent(); 2663 return true; 2664 } 2665 2666 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2667 MachineIRBuilder &B) const { 2668 Register Dst = MI.getOperand(0).getReg(); 2669 Register Src = MI.getOperand(1).getReg(); 2670 unsigned Flags = MI.getFlags(); 2671 LLT Ty = B.getMRI()->getType(Dst); 2672 2673 auto K = B.buildFConstant(Ty, numbers::log2e); 2674 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2675 B.buildFExp2(Dst, Mul, Flags); 2676 MI.eraseFromParent(); 2677 return true; 2678 } 2679 2680 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2681 MachineIRBuilder &B) const { 2682 Register Dst = MI.getOperand(0).getReg(); 2683 Register Src0 = MI.getOperand(1).getReg(); 2684 Register Src1 = MI.getOperand(2).getReg(); 2685 unsigned Flags = MI.getFlags(); 2686 LLT Ty = B.getMRI()->getType(Dst); 2687 const LLT S16 = LLT::scalar(16); 2688 const LLT S32 = LLT::scalar(32); 2689 2690 if (Ty == S32) { 2691 auto Log = B.buildFLog2(S32, Src0, Flags); 2692 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2693 .addUse(Log.getReg(0)) 2694 .addUse(Src1) 2695 .setMIFlags(Flags); 2696 B.buildFExp2(Dst, Mul, Flags); 2697 } else if (Ty == S16) { 2698 // There's no f16 fmul_legacy, so we need to convert for it. 2699 auto Log = B.buildFLog2(S16, Src0, Flags); 2700 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2701 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2702 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2703 .addUse(Ext0.getReg(0)) 2704 .addUse(Ext1.getReg(0)) 2705 .setMIFlags(Flags); 2706 2707 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2708 } else 2709 return false; 2710 2711 MI.eraseFromParent(); 2712 return true; 2713 } 2714 2715 // Find a source register, ignoring any possible source modifiers. 2716 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2717 Register ModSrc = OrigSrc; 2718 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2719 ModSrc = SrcFNeg->getOperand(1).getReg(); 2720 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2721 ModSrc = SrcFAbs->getOperand(1).getReg(); 2722 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2723 ModSrc = SrcFAbs->getOperand(1).getReg(); 2724 return ModSrc; 2725 } 2726 2727 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2728 MachineRegisterInfo &MRI, 2729 MachineIRBuilder &B) const { 2730 2731 const LLT S1 = LLT::scalar(1); 2732 const LLT S64 = LLT::scalar(64); 2733 Register Dst = MI.getOperand(0).getReg(); 2734 Register OrigSrc = MI.getOperand(1).getReg(); 2735 unsigned Flags = MI.getFlags(); 2736 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2737 "this should not have been custom lowered"); 2738 2739 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2740 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2741 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2742 // V_FRACT bug is: 2743 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2744 // 2745 // Convert floor(x) to (x - fract(x)) 2746 2747 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2748 .addUse(OrigSrc) 2749 .setMIFlags(Flags); 2750 2751 // Give source modifier matching some assistance before obscuring a foldable 2752 // pattern. 2753 2754 // TODO: We can avoid the neg on the fract? The input sign to fract 2755 // shouldn't matter? 
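  // Note: 0x3fefffffffffffff below is the largest double strictly less than
  // 1.0 (about 0.99999999999999989); clamping V_FRACT's result to it
  // implements the min() in the workaround described above.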
2756 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2757 2758 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2759 2760 Register Min = MRI.createGenericVirtualRegister(S64); 2761 2762 // We don't need to concern ourselves with the snan handling difference, so 2763 // use the one which will directly select. 2764 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2765 if (MFI->getMode().IEEE) 2766 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2767 else 2768 B.buildFMinNum(Min, Fract, Const, Flags); 2769 2770 Register CorrectedFract = Min; 2771 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2772 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2773 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2774 } 2775 2776 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2777 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2778 2779 MI.eraseFromParent(); 2780 return true; 2781 } 2782 2783 // Turn an illegal packed v2s16 build vector into bit operations. 2784 // TODO: This should probably be a bitcast action in LegalizerHelper. 2785 bool AMDGPULegalizerInfo::legalizeBuildVector( 2786 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2787 Register Dst = MI.getOperand(0).getReg(); 2788 const LLT S32 = LLT::scalar(32); 2789 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2790 2791 Register Src0 = MI.getOperand(1).getReg(); 2792 Register Src1 = MI.getOperand(2).getReg(); 2793 assert(MRI.getType(Src0) == LLT::scalar(16)); 2794 2795 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2796 B.buildBitcast(Dst, Merge); 2797 2798 MI.eraseFromParent(); 2799 return true; 2800 } 2801 2802 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2803 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2804 // case with a single min instruction instead of a compare+select. 2805 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2806 MachineRegisterInfo &MRI, 2807 MachineIRBuilder &B) const { 2808 Register Dst = MI.getOperand(0).getReg(); 2809 Register Src = MI.getOperand(1).getReg(); 2810 LLT DstTy = MRI.getType(Dst); 2811 LLT SrcTy = MRI.getType(Src); 2812 2813 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2814 ? AMDGPU::G_AMDGPU_FFBH_U32 2815 : AMDGPU::G_AMDGPU_FFBL_B32; 2816 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2817 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2818 2819 MI.eraseFromParent(); 2820 return true; 2821 } 2822 2823 // Check that this is a G_XOR x, -1 2824 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2825 if (MI.getOpcode() != TargetOpcode::G_XOR) 2826 return false; 2827 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2828 return ConstVal && *ConstVal == -1; 2829 } 2830 2831 // Return the use branch instruction, otherwise null if the usage is invalid. 
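// The condition is only considered valid if its single non-debug use is a
// G_BRCOND in the same block, possibly through an intervening G_XOR with -1
// (a negation), and that G_BRCOND is either followed by a G_BR giving the
// unconditional target or falls through to the next block.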
2832 static MachineInstr * 2833 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2834 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2835 Register CondDef = MI.getOperand(0).getReg(); 2836 if (!MRI.hasOneNonDBGUse(CondDef)) 2837 return nullptr; 2838 2839 MachineBasicBlock *Parent = MI.getParent(); 2840 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2841 2842 if (isNot(MRI, *UseMI)) { 2843 Register NegatedCond = UseMI->getOperand(0).getReg(); 2844 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2845 return nullptr; 2846 2847 // We're deleting the def of this value, so we need to remove it. 2848 eraseInstr(*UseMI, MRI); 2849 2850 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2851 Negated = true; 2852 } 2853 2854 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2855 return nullptr; 2856 2857 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2858 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2859 if (Next == Parent->end()) { 2860 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2861 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2862 return nullptr; 2863 UncondBrTarget = &*NextMBB; 2864 } else { 2865 if (Next->getOpcode() != AMDGPU::G_BR) 2866 return nullptr; 2867 Br = &*Next; 2868 UncondBrTarget = Br->getOperand(0).getMBB(); 2869 } 2870 2871 return UseMI; 2872 } 2873 2874 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2875 const ArgDescriptor *Arg, 2876 const TargetRegisterClass *ArgRC, 2877 LLT ArgTy) const { 2878 MCRegister SrcReg = Arg->getRegister(); 2879 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2880 assert(DstReg.isVirtual() && "Virtual register expected"); 2881 2882 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 2883 *ArgRC, B.getDebugLoc(), ArgTy); 2884 if (Arg->isMasked()) { 2885 // TODO: Should we try to emit this once in the entry block? 2886 const LLT S32 = LLT::scalar(32); 2887 const unsigned Mask = Arg->getMask(); 2888 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2889 2890 Register AndMaskSrc = LiveIn; 2891 2892 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 2893 // 0. 2894 if (Shift != 0) { 2895 auto ShiftAmt = B.buildConstant(S32, Shift); 2896 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2897 } 2898 2899 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2900 } else { 2901 B.buildCopy(DstReg, LiveIn); 2902 } 2903 2904 return true; 2905 } 2906 2907 bool AMDGPULegalizerInfo::loadInputValue( 2908 Register DstReg, MachineIRBuilder &B, 2909 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2910 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2911 const ArgDescriptor *Arg; 2912 const TargetRegisterClass *ArgRC; 2913 LLT ArgTy; 2914 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2915 2916 if (!Arg) { 2917 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2918 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2919 // case the pointer argument may be missing and we use null. 2920 B.buildConstant(DstReg, 0); 2921 return true; 2922 } 2923 2924 // It's undefined behavior if a function marked with the amdgpu-no-* 2925 // attributes uses the corresponding intrinsic. 
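  // Since the argument was never preloaded there is nothing meaningful to
  // read, so fold the result to undef rather than reserving an input register.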
2926 B.buildUndef(DstReg); 2927 return true; 2928 } 2929 2930 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2931 return false; // TODO: Handle these 2932 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2933 } 2934 2935 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2936 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2937 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2938 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2939 return false; 2940 2941 MI.eraseFromParent(); 2942 return true; 2943 } 2944 2945 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2946 MachineRegisterInfo &MRI, 2947 MachineIRBuilder &B) const { 2948 Register Dst = MI.getOperand(0).getReg(); 2949 LLT DstTy = MRI.getType(Dst); 2950 LLT S16 = LLT::scalar(16); 2951 LLT S32 = LLT::scalar(32); 2952 LLT S64 = LLT::scalar(64); 2953 2954 if (DstTy == S16) 2955 return legalizeFDIV16(MI, MRI, B); 2956 if (DstTy == S32) 2957 return legalizeFDIV32(MI, MRI, B); 2958 if (DstTy == S64) 2959 return legalizeFDIV64(MI, MRI, B); 2960 2961 return false; 2962 } 2963 2964 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2965 Register DstDivReg, 2966 Register DstRemReg, 2967 Register X, 2968 Register Y) const { 2969 const LLT S1 = LLT::scalar(1); 2970 const LLT S32 = LLT::scalar(32); 2971 2972 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2973 // algorithm used here. 2974 2975 // Initial estimate of inv(y). 2976 auto FloatY = B.buildUITOFP(S32, Y); 2977 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2978 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2979 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2980 auto Z = B.buildFPTOUI(S32, ScaledY); 2981 2982 // One round of UNR. 2983 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2984 auto NegYZ = B.buildMul(S32, NegY, Z); 2985 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2986 2987 // Quotient/remainder estimate. 2988 auto Q = B.buildUMulH(S32, X, Z); 2989 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2990 2991 // First quotient/remainder refinement. 2992 auto One = B.buildConstant(S32, 1); 2993 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2994 if (DstDivReg) 2995 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2996 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2997 2998 // Second quotient/remainder refinement. 
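  // Illustration (assuming the initial estimate came out low): for x = 10,
  // y = 3, an estimate of Q = 2 leaves R = 4 >= y, so the first refinement
  // yields Q = 3, R = 1; the second refinement then sees R < y and leaves the
  // values unchanged. The second conditional step covers the case where one
  // correction is not enough.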
2999 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3000 if (DstDivReg) 3001 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 3002 3003 if (DstRemReg) 3004 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3005 } 3006 3007 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3008 // 3009 // Return lo, hi of result 3010 // 3011 // %cvt.lo = G_UITOFP Val.lo 3012 // %cvt.hi = G_UITOFP Val.hi 3013 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3014 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3015 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3016 // %mul2 = G_FMUL %mul1, 2**(-32) 3017 // %trunc = G_INTRINSIC_TRUNC %mul2 3018 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3019 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3020 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3021 Register Val) { 3022 const LLT S32 = LLT::scalar(32); 3023 auto Unmerge = B.buildUnmerge(S32, Val); 3024 3025 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3026 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3027 3028 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3029 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3030 3031 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3032 auto Mul1 = 3033 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3034 3035 // 2**(-32) 3036 auto Mul2 = 3037 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3038 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3039 3040 // -(2**32) 3041 auto Mad2 = B.buildFMAD(S32, Trunc, 3042 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3043 3044 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3045 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3046 3047 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3048 } 3049 3050 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3051 Register DstDivReg, 3052 Register DstRemReg, 3053 Register Numer, 3054 Register Denom) const { 3055 const LLT S32 = LLT::scalar(32); 3056 const LLT S64 = LLT::scalar(64); 3057 const LLT S1 = LLT::scalar(1); 3058 Register RcpLo, RcpHi; 3059 3060 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3061 3062 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3063 3064 auto Zero64 = B.buildConstant(S64, 0); 3065 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3066 3067 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3068 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3069 3070 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3071 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3072 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3073 3074 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3075 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3076 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3077 3078 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3079 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3080 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3081 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3082 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3083 3084 auto Zero32 = B.buildConstant(S32, 0); 3085 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3086 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3087 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3088 3089 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3090 Register NumerLo = UnmergeNumer.getReg(0); 3091 Register NumerHi = UnmergeNumer.getReg(1); 3092 3093 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3094 auto Mul3 = B.buildMul(S64, Denom, 
MulHi3); 3095 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3096 Register Mul3_Lo = UnmergeMul3.getReg(0); 3097 Register Mul3_Hi = UnmergeMul3.getReg(1); 3098 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3099 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3100 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3101 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3102 3103 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3104 Register DenomLo = UnmergeDenom.getReg(0); 3105 Register DenomHi = UnmergeDenom.getReg(1); 3106 3107 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3108 auto C1 = B.buildSExt(S32, CmpHi); 3109 3110 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3111 auto C2 = B.buildSExt(S32, CmpLo); 3112 3113 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3114 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3115 3116 // TODO: Here and below portions of the code can be enclosed into if/endif. 3117 // Currently control flow is unconditional and we have 4 selects after 3118 // potential endif to substitute PHIs. 3119 3120 // if C3 != 0 ... 3121 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3122 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3123 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3124 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3125 3126 auto One64 = B.buildConstant(S64, 1); 3127 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3128 3129 auto C4 = 3130 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3131 auto C5 = 3132 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3133 auto C6 = B.buildSelect( 3134 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3135 3136 // if (C6 != 0) 3137 auto Add4 = B.buildAdd(S64, Add3, One64); 3138 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3139 3140 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3141 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3142 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3143 3144 // endif C6 3145 // endif C3 3146 3147 if (DstDivReg) { 3148 auto Sel1 = B.buildSelect( 3149 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3150 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3151 Sel1, MulHi3); 3152 } 3153 3154 if (DstRemReg) { 3155 auto Sel2 = B.buildSelect( 3156 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3157 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3158 Sel2, Sub1); 3159 } 3160 } 3161 3162 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3163 MachineRegisterInfo &MRI, 3164 MachineIRBuilder &B) const { 3165 Register DstDivReg, DstRemReg; 3166 switch (MI.getOpcode()) { 3167 default: 3168 llvm_unreachable("Unexpected opcode!"); 3169 case AMDGPU::G_UDIV: { 3170 DstDivReg = MI.getOperand(0).getReg(); 3171 break; 3172 } 3173 case AMDGPU::G_UREM: { 3174 DstRemReg = MI.getOperand(0).getReg(); 3175 break; 3176 } 3177 case AMDGPU::G_UDIVREM: { 3178 DstDivReg = MI.getOperand(0).getReg(); 3179 DstRemReg = MI.getOperand(1).getReg(); 3180 break; 3181 } 3182 } 3183 3184 const LLT S64 = LLT::scalar(64); 3185 const LLT S32 = LLT::scalar(32); 3186 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3187 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3188 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3189 LLT Ty 
= MRI.getType(MI.getOperand(0).getReg()); 3190 3191 if (Ty == S32) 3192 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3193 else if (Ty == S64) 3194 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3195 else 3196 return false; 3197 3198 MI.eraseFromParent(); 3199 return true; 3200 } 3201 3202 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3203 MachineRegisterInfo &MRI, 3204 MachineIRBuilder &B) const { 3205 const LLT S64 = LLT::scalar(64); 3206 const LLT S32 = LLT::scalar(32); 3207 3208 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3209 if (Ty != S32 && Ty != S64) 3210 return false; 3211 3212 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3213 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3214 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3215 3216 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3217 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3218 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3219 3220 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3221 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3222 3223 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3224 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3225 3226 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3227 switch (MI.getOpcode()) { 3228 default: 3229 llvm_unreachable("Unexpected opcode!"); 3230 case AMDGPU::G_SDIV: { 3231 DstDivReg = MI.getOperand(0).getReg(); 3232 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3233 break; 3234 } 3235 case AMDGPU::G_SREM: { 3236 DstRemReg = MI.getOperand(0).getReg(); 3237 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3238 break; 3239 } 3240 case AMDGPU::G_SDIVREM: { 3241 DstDivReg = MI.getOperand(0).getReg(); 3242 DstRemReg = MI.getOperand(1).getReg(); 3243 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3244 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3245 break; 3246 } 3247 } 3248 3249 if (Ty == S32) 3250 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3251 else 3252 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3253 3254 if (DstDivReg) { 3255 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3256 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3257 B.buildSub(DstDivReg, SignXor, Sign); 3258 } 3259 3260 if (DstRemReg) { 3261 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3262 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3263 B.buildSub(DstRemReg, SignXor, Sign); 3264 } 3265 3266 MI.eraseFromParent(); 3267 return true; 3268 } 3269 3270 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3271 MachineRegisterInfo &MRI, 3272 MachineIRBuilder &B) const { 3273 Register Res = MI.getOperand(0).getReg(); 3274 Register LHS = MI.getOperand(1).getReg(); 3275 Register RHS = MI.getOperand(2).getReg(); 3276 uint16_t Flags = MI.getFlags(); 3277 LLT ResTy = MRI.getType(Res); 3278 3279 const MachineFunction &MF = B.getMF(); 3280 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3281 MI.getFlag(MachineInstr::FmAfn); 3282 3283 if (!AllowInaccurateRcp) 3284 return false; 3285 3286 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3287 // 1 / x -> RCP(x) 3288 if (CLHS->isExactlyValue(1.0)) { 3289 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3290 .addUse(RHS) 3291 .setMIFlags(Flags); 3292 3293 MI.eraseFromParent(); 3294 return true; 3295 } 3296 3297 // -1 / x -> RCP( FNEG(x) ) 3298 if (CLHS->isExactlyValue(-1.0)) { 3299 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 
3300 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3301 .addUse(FNeg.getReg(0)) 3302 .setMIFlags(Flags); 3303 3304 MI.eraseFromParent(); 3305 return true; 3306 } 3307 } 3308 3309 // x / y -> x * (1.0 / y) 3310 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3311 .addUse(RHS) 3312 .setMIFlags(Flags); 3313 B.buildFMul(Res, LHS, RCP, Flags); 3314 3315 MI.eraseFromParent(); 3316 return true; 3317 } 3318 3319 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3320 MachineRegisterInfo &MRI, 3321 MachineIRBuilder &B) const { 3322 Register Res = MI.getOperand(0).getReg(); 3323 Register X = MI.getOperand(1).getReg(); 3324 Register Y = MI.getOperand(2).getReg(); 3325 uint16_t Flags = MI.getFlags(); 3326 LLT ResTy = MRI.getType(Res); 3327 3328 const MachineFunction &MF = B.getMF(); 3329 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3330 MI.getFlag(MachineInstr::FmAfn); 3331 3332 if (!AllowInaccurateRcp) 3333 return false; 3334 3335 auto NegY = B.buildFNeg(ResTy, Y); 3336 auto One = B.buildFConstant(ResTy, 1.0); 3337 3338 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3339 .addUse(Y) 3340 .setMIFlags(Flags); 3341 3342 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3343 R = B.buildFMA(ResTy, Tmp0, R, R); 3344 3345 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3346 R = B.buildFMA(ResTy, Tmp1, R, R); 3347 3348 auto Ret = B.buildFMul(ResTy, X, R); 3349 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3350 3351 B.buildFMA(Res, Tmp2, R, Ret); 3352 MI.eraseFromParent(); 3353 return true; 3354 } 3355 3356 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3357 MachineRegisterInfo &MRI, 3358 MachineIRBuilder &B) const { 3359 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3360 return true; 3361 3362 Register Res = MI.getOperand(0).getReg(); 3363 Register LHS = MI.getOperand(1).getReg(); 3364 Register RHS = MI.getOperand(2).getReg(); 3365 3366 uint16_t Flags = MI.getFlags(); 3367 3368 LLT S16 = LLT::scalar(16); 3369 LLT S32 = LLT::scalar(32); 3370 3371 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3372 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3373 3374 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3375 .addUse(RHSExt.getReg(0)) 3376 .setMIFlags(Flags); 3377 3378 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3379 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3380 3381 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3382 .addUse(RDst.getReg(0)) 3383 .addUse(RHS) 3384 .addUse(LHS) 3385 .setMIFlags(Flags); 3386 3387 MI.eraseFromParent(); 3388 return true; 3389 } 3390 3391 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3392 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3393 static void toggleSPDenormMode(bool Enable, 3394 MachineIRBuilder &B, 3395 const GCNSubtarget &ST, 3396 AMDGPU::SIModeRegisterDefaults Mode) { 3397 // Set SP denorm mode to this value. 3398 unsigned SPDenormMode = 3399 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3400 3401 if (ST.hasDenormModeInst()) { 3402 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3403 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3404 3405 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3406 B.buildInstr(AMDGPU::S_DENORM_MODE) 3407 .addImm(NewDenormModeValue); 3408 3409 } else { 3410 // Select FP32 bit field in mode register. 
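    // This encodes hwreg(HW_REG_MODE, 4, 2): the two FP32 denorm-control bits
    // start at offset 4 of the MODE register, and WIDTH_M1 holds the field
    // width minus one.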
3411 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3412 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3413 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3414 3415 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3416 .addImm(SPDenormMode) 3417 .addImm(SPDenormModeBitField); 3418 } 3419 } 3420 3421 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3422 MachineRegisterInfo &MRI, 3423 MachineIRBuilder &B) const { 3424 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3425 return true; 3426 3427 Register Res = MI.getOperand(0).getReg(); 3428 Register LHS = MI.getOperand(1).getReg(); 3429 Register RHS = MI.getOperand(2).getReg(); 3430 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3431 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3432 3433 uint16_t Flags = MI.getFlags(); 3434 3435 LLT S32 = LLT::scalar(32); 3436 LLT S1 = LLT::scalar(1); 3437 3438 auto One = B.buildFConstant(S32, 1.0f); 3439 3440 auto DenominatorScaled = 3441 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3442 .addUse(LHS) 3443 .addUse(RHS) 3444 .addImm(0) 3445 .setMIFlags(Flags); 3446 auto NumeratorScaled = 3447 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3448 .addUse(LHS) 3449 .addUse(RHS) 3450 .addImm(1) 3451 .setMIFlags(Flags); 3452 3453 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3454 .addUse(DenominatorScaled.getReg(0)) 3455 .setMIFlags(Flags); 3456 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3457 3458 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3459 // aren't modeled as reading it. 3460 if (!Mode.allFP32Denormals()) 3461 toggleSPDenormMode(true, B, ST, Mode); 3462 3463 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3464 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3465 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3466 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3467 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3468 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3469 3470 if (!Mode.allFP32Denormals()) 3471 toggleSPDenormMode(false, B, ST, Mode); 3472 3473 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3474 .addUse(Fma4.getReg(0)) 3475 .addUse(Fma1.getReg(0)) 3476 .addUse(Fma3.getReg(0)) 3477 .addUse(NumeratorScaled.getReg(1)) 3478 .setMIFlags(Flags); 3479 3480 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3481 .addUse(Fmas.getReg(0)) 3482 .addUse(RHS) 3483 .addUse(LHS) 3484 .setMIFlags(Flags); 3485 3486 MI.eraseFromParent(); 3487 return true; 3488 } 3489 3490 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3491 MachineRegisterInfo &MRI, 3492 MachineIRBuilder &B) const { 3493 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3494 return true; 3495 3496 Register Res = MI.getOperand(0).getReg(); 3497 Register LHS = MI.getOperand(1).getReg(); 3498 Register RHS = MI.getOperand(2).getReg(); 3499 3500 uint16_t Flags = MI.getFlags(); 3501 3502 LLT S64 = LLT::scalar(64); 3503 LLT S1 = LLT::scalar(1); 3504 3505 auto One = B.buildFConstant(S64, 1.0); 3506 3507 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3508 .addUse(LHS) 3509 .addUse(RHS) 3510 .addImm(0) 3511 .setMIFlags(Flags); 3512 3513 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3514 3515 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3516 .addUse(DivScale0.getReg(0)) 3517 .setMIFlags(Flags); 3518 3519 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3520 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3521 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3522 3523 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3524 .addUse(LHS) 3525 .addUse(RHS) 3526 .addImm(1) 3527 .setMIFlags(Flags); 3528 3529 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3530 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3531 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3532 3533 Register Scale; 3534 if (!ST.hasUsableDivScaleConditionOutput()) { 3535 // Workaround a hardware bug on SI where the condition output from div_scale 3536 // is not usable. 3537 3538 LLT S32 = LLT::scalar(32); 3539 3540 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3541 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3542 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3543 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3544 3545 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3546 Scale1Unmerge.getReg(1)); 3547 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3548 Scale0Unmerge.getReg(1)); 3549 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3550 } else { 3551 Scale = DivScale1.getReg(1); 3552 } 3553 3554 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3555 .addUse(Fma4.getReg(0)) 3556 .addUse(Fma3.getReg(0)) 3557 .addUse(Mul.getReg(0)) 3558 .addUse(Scale) 3559 .setMIFlags(Flags); 3560 3561 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3562 .addUse(Fmas.getReg(0)) 3563 .addUse(RHS) 3564 .addUse(LHS) 3565 .setMIFlags(Flags); 3566 3567 MI.eraseFromParent(); 3568 return true; 3569 } 3570 3571 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3572 MachineRegisterInfo &MRI, 3573 MachineIRBuilder &B) const { 3574 Register Res = MI.getOperand(0).getReg(); 3575 Register LHS = MI.getOperand(2).getReg(); 3576 Register RHS = MI.getOperand(3).getReg(); 3577 uint16_t Flags = MI.getFlags(); 3578 3579 LLT S32 = LLT::scalar(32); 3580 LLT S1 = LLT::scalar(1); 3581 3582 auto Abs = B.buildFAbs(S32, RHS, Flags); 3583 const APFloat C0Val(1.0f); 3584 3585 auto C0 = B.buildConstant(S32, 0x6f800000); 3586 auto C1 = B.buildConstant(S32, 0x2f800000); 3587 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3588 3589 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3590 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3591 3592 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3593 3594 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3595 .addUse(Mul0.getReg(0)) 3596 .setMIFlags(Flags); 3597 3598 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3599 3600 B.buildFMul(Res, Sel, Mul1, Flags); 3601 3602 MI.eraseFromParent(); 3603 return true; 3604 } 3605 3606 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3607 // FIXME: Why do we handle this one but not other removed instructions? 3608 // 3609 // Reciprocal square root. The clamp prevents infinite results, clamping 3610 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3611 // +-max_float. 
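//
// Roughly, the expansion below is:
//   %rsq     = llvm.amdgcn.rsq(%src)
//   %clamped = fminnum(%rsq, +max_float)
//   %result  = fmaxnum(%clamped, -max_float)
// using the IEEE variants of fminnum/fmaxnum when the function runs in IEEE
// mode.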
3612 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3613 MachineRegisterInfo &MRI, 3614 MachineIRBuilder &B) const { 3615 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3616 return true; 3617 3618 Register Dst = MI.getOperand(0).getReg(); 3619 Register Src = MI.getOperand(2).getReg(); 3620 auto Flags = MI.getFlags(); 3621 3622 LLT Ty = MRI.getType(Dst); 3623 3624 const fltSemantics *FltSemantics; 3625 if (Ty == LLT::scalar(32)) 3626 FltSemantics = &APFloat::IEEEsingle(); 3627 else if (Ty == LLT::scalar(64)) 3628 FltSemantics = &APFloat::IEEEdouble(); 3629 else 3630 return false; 3631 3632 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3633 .addUse(Src) 3634 .setMIFlags(Flags); 3635 3636 // We don't need to concern ourselves with the snan handling difference, since 3637 // the rsq quieted (or not) so use the one which will directly select. 3638 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3639 const bool UseIEEE = MFI->getMode().IEEE; 3640 3641 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3642 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3643 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3644 3645 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3646 3647 if (UseIEEE) 3648 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3649 else 3650 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3651 MI.eraseFromParent(); 3652 return true; 3653 } 3654 3655 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3656 switch (IID) { 3657 case Intrinsic::amdgcn_ds_fadd: 3658 return AMDGPU::G_ATOMICRMW_FADD; 3659 case Intrinsic::amdgcn_ds_fmin: 3660 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3661 case Intrinsic::amdgcn_ds_fmax: 3662 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3663 default: 3664 llvm_unreachable("not a DS FP intrinsic"); 3665 } 3666 } 3667 3668 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3669 MachineInstr &MI, 3670 Intrinsic::ID IID) const { 3671 GISelChangeObserver &Observer = Helper.Observer; 3672 Observer.changingInstr(MI); 3673 3674 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3675 3676 // The remaining operands were used to set fields in the MemOperand on 3677 // construction. 3678 for (int I = 6; I > 3; --I) 3679 MI.RemoveOperand(I); 3680 3681 MI.RemoveOperand(1); // Remove the intrinsic ID. 
3682 Observer.changedInstr(MI); 3683 return true; 3684 } 3685 3686 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3687 MachineRegisterInfo &MRI, 3688 MachineIRBuilder &B) const { 3689 uint64_t Offset = 3690 ST.getTargetLowering()->getImplicitParameterOffset( 3691 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3692 LLT DstTy = MRI.getType(DstReg); 3693 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3694 3695 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3696 if (!loadInputValue(KernargPtrReg, B, 3697 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3698 return false; 3699 3700 // FIXME: This should be nuw 3701 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3702 return true; 3703 } 3704 3705 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3706 MachineRegisterInfo &MRI, 3707 MachineIRBuilder &B) const { 3708 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3709 if (!MFI->isEntryFunction()) { 3710 return legalizePreloadedArgIntrin(MI, MRI, B, 3711 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3712 } 3713 3714 Register DstReg = MI.getOperand(0).getReg(); 3715 if (!getImplicitArgPtr(DstReg, MRI, B)) 3716 return false; 3717 3718 MI.eraseFromParent(); 3719 return true; 3720 } 3721 3722 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3723 MachineRegisterInfo &MRI, 3724 MachineIRBuilder &B, 3725 unsigned AddrSpace) const { 3726 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3727 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3728 Register Hi32 = Unmerge.getReg(1); 3729 3730 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3731 MI.eraseFromParent(); 3732 return true; 3733 } 3734 3735 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3736 // offset (the offset that is included in bounds checking and swizzling, to be 3737 // split between the instruction's voffset and immoffset fields) and soffset 3738 // (the offset that is excluded from bounds checking and swizzling, to go in 3739 // the instruction's soffset field). This function takes the first kind of 3740 // offset and figures out how to split it between voffset and immoffset. 3741 std::pair<Register, unsigned> 3742 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3743 Register OrigOffset) const { 3744 const unsigned MaxImm = 4095; 3745 Register BaseReg; 3746 unsigned ImmOffset; 3747 const LLT S32 = LLT::scalar(32); 3748 MachineRegisterInfo &MRI = *B.getMRI(); 3749 3750 std::tie(BaseReg, ImmOffset) = 3751 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3752 3753 // If BaseReg is a pointer, convert it to int. 3754 if (MRI.getType(BaseReg).isPointer()) 3755 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3756 3757 // If the immediate value is too big for the immoffset field, put the value 3758 // and -4096 into the immoffset field so that the value that is copied/added 3759 // for the voffset field is a multiple of 4096, and it stands more chance 3760 // of being CSEd with the copy/add for another similar load/store. 3761 // However, do not do that rounding down to a multiple of 4096 if that is a 3762 // negative number, as it appears to be illegal to have a negative offset 3763 // in the vgpr, even if adding the immediate offset makes it positive. 
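  // For example, an offset of 8191 splits into Overflow = 4096 (added to the
  // voffset below) and ImmOffset = 4095. If the rounded-down part would be
  // negative, the entire offset is moved into the voffset and ImmOffset is 0.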
3764 unsigned Overflow = ImmOffset & ~MaxImm; 3765 ImmOffset -= Overflow; 3766 if ((int32_t)Overflow < 0) { 3767 Overflow += ImmOffset; 3768 ImmOffset = 0; 3769 } 3770 3771 if (Overflow != 0) { 3772 if (!BaseReg) { 3773 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3774 } else { 3775 auto OverflowVal = B.buildConstant(S32, Overflow); 3776 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3777 } 3778 } 3779 3780 if (!BaseReg) 3781 BaseReg = B.buildConstant(S32, 0).getReg(0); 3782 3783 return std::make_pair(BaseReg, ImmOffset); 3784 } 3785 3786 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3787 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3788 Register VOffset, Register SOffset, 3789 unsigned ImmOffset, Register VIndex, 3790 MachineRegisterInfo &MRI) const { 3791 Optional<ValueAndVReg> MaybeVOffsetVal = 3792 getIConstantVRegValWithLookThrough(VOffset, MRI); 3793 Optional<ValueAndVReg> MaybeSOffsetVal = 3794 getIConstantVRegValWithLookThrough(SOffset, MRI); 3795 Optional<ValueAndVReg> MaybeVIndexVal = 3796 getIConstantVRegValWithLookThrough(VIndex, MRI); 3797 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3798 // update the MMO with that offset. The stride is unknown so we can only do 3799 // this if VIndex is constant 0. 3800 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3801 MaybeVIndexVal->Value == 0) { 3802 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3803 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3804 MMO->setOffset(TotalOffset); 3805 } else { 3806 // We don't have a constant combined offset to use in the MMO. Give up. 3807 MMO->setValue((Value *)nullptr); 3808 } 3809 } 3810 3811 /// Handle register layout difference for f16 images for some subtargets. 
3812 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3813 MachineRegisterInfo &MRI, 3814 Register Reg, 3815 bool ImageStore) const { 3816 const LLT S16 = LLT::scalar(16); 3817 const LLT S32 = LLT::scalar(32); 3818 LLT StoreVT = MRI.getType(Reg); 3819 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3820 3821 if (ST.hasUnpackedD16VMem()) { 3822 auto Unmerge = B.buildUnmerge(S16, Reg); 3823 3824 SmallVector<Register, 4> WideRegs; 3825 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3826 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3827 3828 int NumElts = StoreVT.getNumElements(); 3829 3830 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3831 .getReg(0); 3832 } 3833 3834 if (ImageStore && ST.hasImageStoreD16Bug()) { 3835 if (StoreVT.getNumElements() == 2) { 3836 SmallVector<Register, 4> PackedRegs; 3837 Reg = B.buildBitcast(S32, Reg).getReg(0); 3838 PackedRegs.push_back(Reg); 3839 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3840 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3841 .getReg(0); 3842 } 3843 3844 if (StoreVT.getNumElements() == 3) { 3845 SmallVector<Register, 4> PackedRegs; 3846 auto Unmerge = B.buildUnmerge(S16, Reg); 3847 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3848 PackedRegs.push_back(Unmerge.getReg(I)); 3849 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3850 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3851 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3852 } 3853 3854 if (StoreVT.getNumElements() == 4) { 3855 SmallVector<Register, 4> PackedRegs; 3856 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3857 auto Unmerge = B.buildUnmerge(S32, Reg); 3858 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3859 PackedRegs.push_back(Unmerge.getReg(I)); 3860 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3861 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3862 .getReg(0); 3863 } 3864 3865 llvm_unreachable("invalid data type"); 3866 } 3867 3868 if (StoreVT == LLT::fixed_vector(3, S16)) { 3869 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 3870 .getReg(0); 3871 } 3872 return Reg; 3873 } 3874 3875 Register AMDGPULegalizerInfo::fixStoreSourceType( 3876 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3877 MachineRegisterInfo *MRI = B.getMRI(); 3878 LLT Ty = MRI->getType(VData); 3879 3880 const LLT S16 = LLT::scalar(16); 3881 3882 // Fixup illegal register types for i8 stores. 
3883 if (Ty == LLT::scalar(8) || Ty == S16) { 3884 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3885 return AnyExt; 3886 } 3887 3888 if (Ty.isVector()) { 3889 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3890 if (IsFormat) 3891 return handleD16VData(B, *MRI, VData); 3892 } 3893 } 3894 3895 return VData; 3896 } 3897 3898 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3899 MachineRegisterInfo &MRI, 3900 MachineIRBuilder &B, 3901 bool IsTyped, 3902 bool IsFormat) const { 3903 Register VData = MI.getOperand(1).getReg(); 3904 LLT Ty = MRI.getType(VData); 3905 LLT EltTy = Ty.getScalarType(); 3906 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3907 const LLT S32 = LLT::scalar(32); 3908 3909 VData = fixStoreSourceType(B, VData, IsFormat); 3910 Register RSrc = MI.getOperand(2).getReg(); 3911 3912 MachineMemOperand *MMO = *MI.memoperands_begin(); 3913 const int MemSize = MMO->getSize(); 3914 3915 unsigned ImmOffset; 3916 3917 // The typed intrinsics add an immediate after the registers. 3918 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3919 3920 // The struct intrinsic variants add one additional operand over raw. 3921 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3922 Register VIndex; 3923 int OpOffset = 0; 3924 if (HasVIndex) { 3925 VIndex = MI.getOperand(3).getReg(); 3926 OpOffset = 1; 3927 } else { 3928 VIndex = B.buildConstant(S32, 0).getReg(0); 3929 } 3930 3931 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3932 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3933 3934 unsigned Format = 0; 3935 if (IsTyped) { 3936 Format = MI.getOperand(5 + OpOffset).getImm(); 3937 ++OpOffset; 3938 } 3939 3940 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3941 3942 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3943 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3944 3945 unsigned Opc; 3946 if (IsTyped) { 3947 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3948 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3949 } else if (IsFormat) { 3950 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3951 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3952 } else { 3953 switch (MemSize) { 3954 case 1: 3955 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3956 break; 3957 case 2: 3958 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3959 break; 3960 default: 3961 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3962 break; 3963 } 3964 } 3965 3966 auto MIB = B.buildInstr(Opc) 3967 .addUse(VData) // vdata 3968 .addUse(RSrc) // rsrc 3969 .addUse(VIndex) // vindex 3970 .addUse(VOffset) // voffset 3971 .addUse(SOffset) // soffset 3972 .addImm(ImmOffset); // offset(imm) 3973 3974 if (IsTyped) 3975 MIB.addImm(Format); 3976 3977 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3978 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3979 .addMemOperand(MMO); 3980 3981 MI.eraseFromParent(); 3982 return true; 3983 } 3984 3985 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3986 MachineRegisterInfo &MRI, 3987 MachineIRBuilder &B, 3988 bool IsFormat, 3989 bool IsTyped) const { 3990 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3991 MachineMemOperand *MMO = *MI.memoperands_begin(); 3992 const LLT MemTy = MMO->getMemoryType(); 3993 const LLT S32 = LLT::scalar(32); 3994 3995 Register Dst = MI.getOperand(0).getReg(); 3996 Register RSrc = MI.getOperand(2).getReg(); 3997 3998 // The typed intrinsics add an immediate after the registers. 
3999   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4000
4001   // The struct intrinsic variants add one additional operand over raw.
4002   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4003   Register VIndex;
4004   int OpOffset = 0;
4005   if (HasVIndex) {
4006     VIndex = MI.getOperand(3).getReg();
4007     OpOffset = 1;
4008   } else {
4009     VIndex = B.buildConstant(S32, 0).getReg(0);
4010   }
4011
4012   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4013   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4014
4015   unsigned Format = 0;
4016   if (IsTyped) {
4017     Format = MI.getOperand(5 + OpOffset).getImm();
4018     ++OpOffset;
4019   }
4020
4021   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4022   unsigned ImmOffset;
4023
4024   LLT Ty = MRI.getType(Dst);
4025   LLT EltTy = Ty.getScalarType();
4026   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4027   const bool Unpacked = ST.hasUnpackedD16VMem();
4028
4029   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4030   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4031
4032   unsigned Opc;
4033
4034   if (IsTyped) {
4035     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4036                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4037   } else if (IsFormat) {
4038     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4039                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4040   } else {
4041     switch (MemTy.getSizeInBits()) {
4042     case 8:
4043       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4044       break;
4045     case 16:
4046       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4047       break;
4048     default:
4049       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4050       break;
4051     }
4052   }
4053
4054   Register LoadDstReg;
4055
4056   bool IsExtLoad =
4057       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4058   LLT UnpackedTy = Ty.changeElementSize(32);
4059
4060   if (IsExtLoad)
4061     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4062   else if (Unpacked && IsD16 && Ty.isVector())
4063     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4064   else
4065     LoadDstReg = Dst;
4066
4067   auto MIB = B.buildInstr(Opc)
4068                  .addDef(LoadDstReg) // vdata
4069                  .addUse(RSrc)       // rsrc
4070                  .addUse(VIndex)     // vindex
4071                  .addUse(VOffset)    // voffset
4072                  .addUse(SOffset)    // soffset
4073                  .addImm(ImmOffset); // offset(imm)
4074
4075   if (IsTyped)
4076     MIB.addImm(Format);
4077
4078   MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
4079       .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4080       .addMemOperand(MMO);
4081
4082   if (LoadDstReg != Dst) {
4083     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4084
4085     // The result was widened for an extending load; truncate back to the
4086     // original type.
4086     if (IsExtLoad)
4087       B.buildTrunc(Dst, LoadDstReg);
4088     else {
4089       // Repack to original 16-bit vector result
4090       // FIXME: G_TRUNC should work, but legalization currently fails
4091       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4092       SmallVector<Register, 4> Repack;
4093       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4094         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4095       B.buildMerge(Dst, Repack);
4096     }
4097   }
4098
4099   MI.eraseFromParent();
4100   return true;
4101 }
4102
4103 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4104                                                MachineIRBuilder &B,
4105                                                bool IsInc) const {
4106   unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 4107 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4108 B.buildInstr(Opc) 4109 .addDef(MI.getOperand(0).getReg()) 4110 .addUse(MI.getOperand(2).getReg()) 4111 .addUse(MI.getOperand(3).getReg()) 4112 .cloneMemRefs(MI); 4113 MI.eraseFromParent(); 4114 return true; 4115 } 4116 4117 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4118 switch (IntrID) { 4119 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4120 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4121 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4122 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4123 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4124 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4125 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4126 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4127 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4128 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4129 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4130 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4131 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4132 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4133 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4134 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4135 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4136 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4137 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4138 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4139 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4140 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4141 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4142 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4143 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4144 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4145 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4146 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4147 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4148 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4149 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4150 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4151 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4152 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4153 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4154 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4155 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4156 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4157 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4158 case Intrinsic::amdgcn_buffer_atomic_fadd: 4159 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4160 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4161 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4162 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4163 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4164 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4165 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4166 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4167 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4168 default: 4169 llvm_unreachable("unhandled atomic opcode"); 4170 } 4171 } 4172 4173 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4174 MachineIRBuilder &B, 4175 Intrinsic::ID IID) const { 4176 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4177 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4178 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4179 4180 Register Dst; 4181 4182 int OpOffset = 0; 4183 if (HasReturn) { 4184 // A few FP atomics do not support return values. 
4185 Dst = MI.getOperand(0).getReg(); 4186 } else { 4187 OpOffset = -1; 4188 } 4189 4190 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4191 Register CmpVal; 4192 4193 if (IsCmpSwap) { 4194 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4195 ++OpOffset; 4196 } 4197 4198 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4199 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4200 4201 // The struct intrinsic variants add one additional operand over raw. 4202 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4203 Register VIndex; 4204 if (HasVIndex) { 4205 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4206 ++OpOffset; 4207 } else { 4208 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4209 } 4210 4211 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4212 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4213 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4214 4215 MachineMemOperand *MMO = *MI.memoperands_begin(); 4216 4217 unsigned ImmOffset; 4218 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4219 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4220 4221 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4222 4223 if (HasReturn) 4224 MIB.addDef(Dst); 4225 4226 MIB.addUse(VData); // vdata 4227 4228 if (IsCmpSwap) 4229 MIB.addReg(CmpVal); 4230 4231 MIB.addUse(RSrc) // rsrc 4232 .addUse(VIndex) // vindex 4233 .addUse(VOffset) // voffset 4234 .addUse(SOffset) // soffset 4235 .addImm(ImmOffset) // offset(imm) 4236 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4237 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4238 .addMemOperand(MMO); 4239 4240 MI.eraseFromParent(); 4241 return true; 4242 } 4243 4244 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4245 /// vector with s16 typed elements. 4246 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4247 SmallVectorImpl<Register> &PackedAddrs, 4248 unsigned ArgOffset, 4249 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4250 bool IsA16, bool IsG16) { 4251 const LLT S16 = LLT::scalar(16); 4252 const LLT V2S16 = LLT::fixed_vector(2, 16); 4253 auto EndIdx = Intr->VAddrEnd; 4254 4255 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4256 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4257 if (!SrcOp.isReg()) 4258 continue; // _L to _LZ may have eliminated this. 4259 4260 Register AddrReg = SrcOp.getReg(); 4261 4262 if ((I < Intr->GradientStart) || 4263 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4264 (I >= Intr->CoordStart && !IsA16)) { 4265 // Handle any gradient or coordinate operands that should not be packed 4266 if ((I < Intr->GradientStart) && IsA16 && 4267 (B.getMRI()->getType(AddrReg) == S16)) { 4268 // Special handling of bias when A16 is on. Bias is of type half but 4269 // occupies full 32-bit. 4270 PackedAddrs.push_back( 4271 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4272 .getReg(0)); 4273 } else { 4274 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4275 PackedAddrs.push_back(AddrReg); 4276 } 4277 } else { 4278 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4279 // derivatives dx/dh and dx/dv are packed with undef. 
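      // e.g. a pair of s16 coordinates (u, v) becomes one <2 x s16>
      // build_vector; an unpaired trailing coordinate is padded with undef.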
4280       if (((I + 1) >= EndIdx) ||
4281           ((Intr->NumGradients / 2) % 2 == 1 &&
4282            (I == static_cast<unsigned>(Intr->GradientStart +
4283                                        (Intr->NumGradients / 2) - 1) ||
4284             I == static_cast<unsigned>(Intr->GradientStart +
4285                                        Intr->NumGradients - 1))) ||
4286           // Check for _L to _LZ optimization
4287           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4288         PackedAddrs.push_back(
4289             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4290                 .getReg(0));
4291       } else {
4292         PackedAddrs.push_back(
4293             B.buildBuildVector(
4294                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4295                 .getReg(0));
4296         ++I;
4297       }
4298     }
4299   }
4300 }
4301
4302 /// Convert from separate vaddr components to a single vector address register,
4303 /// and replace the remaining operands with $noreg.
4304 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4305                                      int DimIdx, int NumVAddrs) {
4306   const LLT S32 = LLT::scalar(32);
4307
4308   SmallVector<Register, 8> AddrRegs;
4309   for (int I = 0; I != NumVAddrs; ++I) {
4310     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4311     if (SrcOp.isReg()) {
4312       AddrRegs.push_back(SrcOp.getReg());
4313       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4314     }
4315   }
4316
4317   int NumAddrRegs = AddrRegs.size();
4318   if (NumAddrRegs != 1) {
4319     // Above 8 elements round up to next power of 2 (i.e. 16).
4320     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4321       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4322       auto Undef = B.buildUndef(S32);
4323       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4324       NumAddrRegs = RoundedNumRegs;
4325     }
4326
4327     auto VAddr =
4328         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4329     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4330   }
4331
4332   for (int I = 1; I != NumVAddrs; ++I) {
4333     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4334     if (SrcOp.isReg())
4335       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4336   }
4337 }
4338
4339 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4340 ///
4341 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
4342 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4343 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4344 /// registers.
4345 ///
4346 /// We don't want to directly select image instructions just yet, but also want
4347 /// to expose all register repacking to the legalizer/combiners. We also don't
4348 /// want a selected instruction entering RegBankSelect. In order to avoid
4349 /// defining a multitude of intermediate image instructions, directly hack on
4350 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4351 /// padding now-unnecessary arguments with $noreg.
4352 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4353     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4354     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4355
4356   const unsigned NumDefs = MI.getNumExplicitDefs();
4357   const unsigned ArgOffset = NumDefs + 1;
4358   bool IsTFE = NumDefs == 2;
4359   // We are only processing the operands of d16 image operations on subtargets
4360   // that use the unpacked register layout, or need to repack the TFE result.
4361
4362   // TODO: Do we need to guard against already legalized intrinsics?
4363   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4364       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4365
4366   MachineRegisterInfo *MRI = B.getMRI();
4367   const LLT S32 = LLT::scalar(32);
4368   const LLT S16 = LLT::scalar(16);
4369   const LLT V2S16 = LLT::fixed_vector(2, 16);
4370
4371   unsigned DMask = 0;
4372   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4373   LLT Ty = MRI->getType(VData);
4374
4375   // Check for 16-bit addresses and pack them if so.
4376   LLT GradTy =
4377       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4378   LLT AddrTy =
4379       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4380   const bool IsG16 = GradTy == S16;
4381   const bool IsA16 = AddrTy == S16;
4382   const bool IsD16 = Ty.getScalarType() == S16;
4383
4384   int DMaskLanes = 0;
4385   if (!BaseOpcode->Atomic) {
4386     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4387     if (BaseOpcode->Gather4) {
4388       DMaskLanes = 4;
4389     } else if (DMask != 0) {
4390       DMaskLanes = countPopulation(DMask);
4391     } else if (!IsTFE && !BaseOpcode->Store) {
4392       // If dmask is 0, this is a no-op load. This can be eliminated.
4393       B.buildUndef(MI.getOperand(0));
4394       MI.eraseFromParent();
4395       return true;
4396     }
4397   }
4398
4399   Observer.changingInstr(MI);
4400   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4401
4402   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4403                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4404   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4405                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4406   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4407
4408   // Track that we legalized this.
4409   MI.setDesc(B.getTII().get(NewOpcode));
4410
4411   // TFE is enabled, so an error flag result is expected even though dmask is
4412   // 0. Force dmask to be at least 1, otherwise the instruction will fail.
4413   if (IsTFE && DMask == 0) {
4414     DMask = 0x1;
4415     DMaskLanes = 1;
4416     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4417   }
4418
4419   if (BaseOpcode->Atomic) {
4420     Register VData0 = MI.getOperand(2).getReg();
4421     LLT Ty = MRI->getType(VData0);
4422
4423     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4424     if (Ty.isVector())
4425       return false;
4426
4427     if (BaseOpcode->AtomicX2) {
4428       Register VData1 = MI.getOperand(3).getReg();
4429       // The two values are packed in one register.
4430       LLT PackedTy = LLT::fixed_vector(2, Ty);
4431       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4432       MI.getOperand(2).setReg(Concat.getReg(0));
4433       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4434     }
4435   }
4436
4437   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4438
4439   // Optimize _L to _LZ when _L is zero
4440   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4441           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4442     const ConstantFP *ConstantLod;
4443
4444     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4445                  m_GFCst(ConstantLod))) {
4446       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4447         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4448         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4449             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
4450                                                      Intr->Dim);
4451
4452         // The starting indexes should remain in the same place.
4453 --CorrectedNumVAddrs; 4454 4455 MI.getOperand(NumDefs).setIntrinsicID( 4456 static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4457 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4458 Intr = NewImageDimIntr; 4459 } 4460 } 4461 } 4462 4463 // Optimize _mip away, when 'lod' is zero 4464 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4465 int64_t ConstantLod; 4466 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4467 m_ICst(ConstantLod))) { 4468 if (ConstantLod == 0) { 4469 // TODO: Change intrinsic opcode and remove operand instead or replacing 4470 // it with 0, as the _L to _LZ handling is done above. 4471 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4472 --CorrectedNumVAddrs; 4473 } 4474 } 4475 } 4476 4477 // Rewrite the addressing register layout before doing anything else. 4478 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4479 // 16 bit gradients are supported, but are tied to the A16 control 4480 // so both gradients and addresses must be 16 bit 4481 return false; 4482 } 4483 4484 if (IsA16 && !ST.hasA16()) { 4485 // A16 not supported 4486 return false; 4487 } 4488 4489 if (IsA16 || IsG16) { 4490 if (Intr->NumVAddrs > 1) { 4491 SmallVector<Register, 4> PackedRegs; 4492 4493 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4494 IsG16); 4495 4496 // See also below in the non-a16 branch 4497 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4498 PackedRegs.size() <= ST.getNSAMaxSize(); 4499 4500 if (!UseNSA && PackedRegs.size() > 1) { 4501 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4502 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4503 PackedRegs[0] = Concat.getReg(0); 4504 PackedRegs.resize(1); 4505 } 4506 4507 const unsigned NumPacked = PackedRegs.size(); 4508 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4509 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4510 if (!SrcOp.isReg()) { 4511 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4512 continue; 4513 } 4514 4515 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4516 4517 if (I - Intr->VAddrStart < NumPacked) 4518 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4519 else 4520 SrcOp.setReg(AMDGPU::NoRegister); 4521 } 4522 } 4523 } else { 4524 // If the register allocator cannot place the address registers contiguously 4525 // without introducing moves, then using the non-sequential address encoding 4526 // is always preferable, since it saves VALU instructions and is usually a 4527 // wash in terms of code size or even better. 4528 // 4529 // However, we currently have no way of hinting to the register allocator 4530 // that MIMG addresses should be placed contiguously when it is possible to 4531 // do so, so force non-NSA for the common 2-address case as a heuristic. 4532 // 4533 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4534 // allocation when possible. 4535 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4536 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4537 4538 if (!UseNSA && Intr->NumVAddrs > 1) 4539 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4540 Intr->NumVAddrs); 4541 } 4542 4543 int Flags = 0; 4544 if (IsA16) 4545 Flags |= 1; 4546 if (IsG16) 4547 Flags |= 2; 4548 MI.addOperand(MachineOperand::CreateImm(Flags)); 4549 4550 if (BaseOpcode->Store) { // No TFE for stores? 
4551 // TODO: Handle dmask trim 4552 if (!Ty.isVector() || !IsD16) 4553 return true; 4554 4555 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4556 if (RepackedReg != VData) { 4557 MI.getOperand(1).setReg(RepackedReg); 4558 } 4559 4560 return true; 4561 } 4562 4563 Register DstReg = MI.getOperand(0).getReg(); 4564 const LLT EltTy = Ty.getScalarType(); 4565 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4566 4567 // Confirm that the return type is large enough for the dmask specified 4568 if (NumElts < DMaskLanes) 4569 return false; 4570 4571 if (NumElts > 4 || DMaskLanes > 4) 4572 return false; 4573 4574 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4575 const LLT AdjustedTy = 4576 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4577 4578 // The raw dword aligned data component of the load. The only legal cases 4579 // where this matters should be when using the packed D16 format, for 4580 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4581 LLT RoundedTy; 4582 4583 // S32 vector to to cover all data, plus TFE result element. 4584 LLT TFETy; 4585 4586 // Register type to use for each loaded component. Will be S32 or V2S16. 4587 LLT RegTy; 4588 4589 if (IsD16 && ST.hasUnpackedD16VMem()) { 4590 RoundedTy = 4591 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4592 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4593 RegTy = S32; 4594 } else { 4595 unsigned EltSize = EltTy.getSizeInBits(); 4596 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4597 unsigned RoundedSize = 32 * RoundedElts; 4598 RoundedTy = LLT::scalarOrVector( 4599 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4600 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4601 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4602 } 4603 4604 // The return type does not need adjustment. 4605 // TODO: Should we change s16 case to s32 or <2 x s16>? 4606 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4607 return true; 4608 4609 Register Dst1Reg; 4610 4611 // Insert after the instruction. 4612 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4613 4614 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4615 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4616 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4617 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4618 4619 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4620 4621 MI.getOperand(0).setReg(NewResultReg); 4622 4623 // In the IR, TFE is supposed to be used with a 2 element struct return 4624 // type. The instruction really returns these two values in one contiguous 4625 // register, with one additional dword beyond the loaded data. Rewrite the 4626 // return type to use a single register result. 4627 4628 if (IsTFE) { 4629 Dst1Reg = MI.getOperand(1).getReg(); 4630 if (MRI->getType(Dst1Reg) != S32) 4631 return false; 4632 4633 // TODO: Make sure the TFE operand bit is set. 4634 MI.RemoveOperand(1); 4635 4636 // Handle the easy case that requires no repack instructions. 4637 if (Ty == S32) { 4638 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4639 return true; 4640 } 4641 } 4642 4643 // Now figure out how to copy the new result register back into the old 4644 // result. 4645 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4646 4647 const int NumDataRegs = IsTFE ? 
ResultNumRegs - 1 : ResultNumRegs; 4648 4649 if (ResultNumRegs == 1) { 4650 assert(!IsTFE); 4651 ResultRegs[0] = NewResultReg; 4652 } else { 4653 // We have to repack into a new vector of some kind. 4654 for (int I = 0; I != NumDataRegs; ++I) 4655 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4656 B.buildUnmerge(ResultRegs, NewResultReg); 4657 4658 // Drop the final TFE element to get the data part. The TFE result is 4659 // directly written to the right place already. 4660 if (IsTFE) 4661 ResultRegs.resize(NumDataRegs); 4662 } 4663 4664 // For an s16 scalar result, we form an s32 result with a truncate regardless 4665 // of packed vs. unpacked. 4666 if (IsD16 && !Ty.isVector()) { 4667 B.buildTrunc(DstReg, ResultRegs[0]); 4668 return true; 4669 } 4670 4671 // Avoid a build/concat_vector of 1 entry. 4672 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4673 B.buildBitcast(DstReg, ResultRegs[0]); 4674 return true; 4675 } 4676 4677 assert(Ty.isVector()); 4678 4679 if (IsD16) { 4680 // For packed D16 results with TFE enabled, all the data components are 4681 // S32. Cast back to the expected type. 4682 // 4683 // TODO: We don't really need to use load s32 elements. We would only need one 4684 // cast for the TFE result if a multiple of v2s16 was used. 4685 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4686 for (Register &Reg : ResultRegs) 4687 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4688 } else if (ST.hasUnpackedD16VMem()) { 4689 for (Register &Reg : ResultRegs) 4690 Reg = B.buildTrunc(S16, Reg).getReg(0); 4691 } 4692 } 4693 4694 auto padWithUndef = [&](LLT Ty, int NumElts) { 4695 if (NumElts == 0) 4696 return; 4697 Register Undef = B.buildUndef(Ty).getReg(0); 4698 for (int I = 0; I != NumElts; ++I) 4699 ResultRegs.push_back(Undef); 4700 }; 4701 4702 // Pad out any elements eliminated due to the dmask. 4703 LLT ResTy = MRI->getType(ResultRegs[0]); 4704 if (!ResTy.isVector()) { 4705 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4706 B.buildBuildVector(DstReg, ResultRegs); 4707 return true; 4708 } 4709 4710 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4711 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4712 4713 // Deal with the one annoying legal case. 
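  // That case is <3 x s16>: its data is covered by two v2s16 registers, so the
  // result must be padded or trimmed rather than concatenated directly.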
4714 const LLT V3S16 = LLT::fixed_vector(3, 16); 4715 if (Ty == V3S16) { 4716 if (IsTFE) { 4717 if (ResultRegs.size() == 1) { 4718 NewResultReg = ResultRegs[0]; 4719 } else if (ResultRegs.size() == 2) { 4720 LLT V4S16 = LLT::fixed_vector(4, 16); 4721 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 4722 } else { 4723 return false; 4724 } 4725 } 4726 4727 if (MRI->getType(DstReg).getNumElements() < 4728 MRI->getType(NewResultReg).getNumElements()) { 4729 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 4730 } else { 4731 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 4732 } 4733 return true; 4734 } 4735 4736 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4737 B.buildConcatVectors(DstReg, ResultRegs); 4738 return true; 4739 } 4740 4741 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4742 LegalizerHelper &Helper, MachineInstr &MI) const { 4743 MachineIRBuilder &B = Helper.MIRBuilder; 4744 GISelChangeObserver &Observer = Helper.Observer; 4745 4746 Register Dst = MI.getOperand(0).getReg(); 4747 LLT Ty = B.getMRI()->getType(Dst); 4748 unsigned Size = Ty.getSizeInBits(); 4749 MachineFunction &MF = B.getMF(); 4750 4751 Observer.changingInstr(MI); 4752 4753 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4754 Ty = getBitcastRegisterType(Ty); 4755 Helper.bitcastDst(MI, Ty, 0); 4756 Dst = MI.getOperand(0).getReg(); 4757 B.setInsertPt(B.getMBB(), MI); 4758 } 4759 4760 // FIXME: We don't really need this intermediate instruction. The intrinsic 4761 // should be fixed to have a memory operand. Since it's readnone, we're not 4762 // allowed to add one. 4763 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4764 MI.RemoveOperand(1); // Remove intrinsic ID 4765 4766 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4767 // TODO: Should this use datalayout alignment? 4768 const unsigned MemSize = (Size + 7) / 8; 4769 const Align MemAlign(4); 4770 MachineMemOperand *MMO = MF.getMachineMemOperand( 4771 MachinePointerInfo(), 4772 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4773 MachineMemOperand::MOInvariant, 4774 MemSize, MemAlign); 4775 MI.addMemOperand(MF, MMO); 4776 4777 // There are no 96-bit result scalar loads, but widening to 128-bit should 4778 // always be legal. We may need to restore this to a 96-bit result if it turns 4779 // out this needs to be converted to a vector load during RegBankSelect. 4780 if (!isPowerOf2_32(Size)) { 4781 if (Ty.isVector()) 4782 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4783 else 4784 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4785 } 4786 4787 Observer.changedInstr(MI); 4788 return true; 4789 } 4790 4791 // TODO: Move to selection 4792 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4793 MachineRegisterInfo &MRI, 4794 MachineIRBuilder &B) const { 4795 if (!ST.isTrapHandlerEnabled() || 4796 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4797 return legalizeTrapEndpgm(MI, MRI, B); 4798 4799 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4800 switch (*HsaAbiVer) { 4801 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4802 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4803 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4804 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4805 return ST.supportsGetDoorbellID() ? 
4806 legalizeTrapHsa(MI, MRI, B) : 4807 legalizeTrapHsaQueuePtr(MI, MRI, B); 4808 } 4809 } 4810 4811 llvm_unreachable("Unknown trap handler"); 4812 } 4813 4814 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4815 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4816 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4817 MI.eraseFromParent(); 4818 return true; 4819 } 4820 4821 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4822 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4823 // Pass queue pointer to trap handler as input, and insert trap instruction 4824 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4825 Register LiveIn = 4826 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4827 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4828 return false; 4829 4830 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4831 B.buildCopy(SGPR01, LiveIn); 4832 B.buildInstr(AMDGPU::S_TRAP) 4833 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4834 .addReg(SGPR01, RegState::Implicit); 4835 4836 MI.eraseFromParent(); 4837 return true; 4838 } 4839 4840 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4841 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4842 B.buildInstr(AMDGPU::S_TRAP) 4843 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4844 MI.eraseFromParent(); 4845 return true; 4846 } 4847 4848 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4849 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4850 // Is non-HSA path or trap-handler disabled? Then, report a warning 4851 // accordingly 4852 if (!ST.isTrapHandlerEnabled() || 4853 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4854 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4855 "debugtrap handler not supported", 4856 MI.getDebugLoc(), DS_Warning); 4857 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4858 Ctx.diagnose(NoTrap); 4859 } else { 4860 // Insert debug-trap instruction 4861 B.buildInstr(AMDGPU::S_TRAP) 4862 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4863 } 4864 4865 MI.eraseFromParent(); 4866 return true; 4867 } 4868 4869 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4870 MachineIRBuilder &B) const { 4871 MachineRegisterInfo &MRI = *B.getMRI(); 4872 const LLT S16 = LLT::scalar(16); 4873 const LLT S32 = LLT::scalar(32); 4874 4875 Register DstReg = MI.getOperand(0).getReg(); 4876 Register NodePtr = MI.getOperand(2).getReg(); 4877 Register RayExtent = MI.getOperand(3).getReg(); 4878 Register RayOrigin = MI.getOperand(4).getReg(); 4879 Register RayDir = MI.getOperand(5).getReg(); 4880 Register RayInvDir = MI.getOperand(6).getReg(); 4881 Register TDescr = MI.getOperand(7).getReg(); 4882 4883 if (!ST.hasGFX10_AEncoding()) { 4884 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4885 "intrinsic not supported on subtarget", 4886 MI.getDebugLoc()); 4887 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4888 return false; 4889 } 4890 4891 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4892 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4893 const unsigned NumVDataDwords = 4; 4894 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 
12 : 11); 4895 const bool UseNSA = 4896 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4897 const unsigned BaseOpcodes[2][2] = { 4898 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4899 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4900 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4901 int Opcode; 4902 if (UseNSA) { 4903 Opcode = 4904 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4905 NumVDataDwords, NumVAddrDwords); 4906 } else { 4907 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4908 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4909 PowerOf2Ceil(NumVAddrDwords)); 4910 } 4911 assert(Opcode != -1); 4912 4913 SmallVector<Register, 12> Ops; 4914 if (Is64) { 4915 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4916 Ops.push_back(Unmerge.getReg(0)); 4917 Ops.push_back(Unmerge.getReg(1)); 4918 } else { 4919 Ops.push_back(NodePtr); 4920 } 4921 Ops.push_back(RayExtent); 4922 4923 auto packLanes = [&Ops, &S32, &B](Register Src) { 4924 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 4925 Ops.push_back(Unmerge.getReg(0)); 4926 Ops.push_back(Unmerge.getReg(1)); 4927 Ops.push_back(Unmerge.getReg(2)); 4928 }; 4929 4930 packLanes(RayOrigin); 4931 if (IsA16) { 4932 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 4933 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 4934 Register R1 = MRI.createGenericVirtualRegister(S32); 4935 Register R2 = MRI.createGenericVirtualRegister(S32); 4936 Register R3 = MRI.createGenericVirtualRegister(S32); 4937 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4938 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4939 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4940 Ops.push_back(R1); 4941 Ops.push_back(R2); 4942 Ops.push_back(R3); 4943 } else { 4944 packLanes(RayDir); 4945 packLanes(RayInvDir); 4946 } 4947 4948 if (!UseNSA) { 4949 // Build a single vector containing all the operands so far prepared. 4950 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 4951 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); 4952 Ops.clear(); 4953 Ops.push_back(MergedOps); 4954 } 4955 4956 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4957 .addDef(DstReg) 4958 .addImm(Opcode); 4959 4960 for (Register R : Ops) { 4961 MIB.addUse(R); 4962 } 4963 4964 MIB.addUse(TDescr) 4965 .addImm(IsA16 ? 1 : 0) 4966 .cloneMemRefs(MI); 4967 4968 MI.eraseFromParent(); 4969 return true; 4970 } 4971 4972 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) { 4973 B.buildConstant(MI.getOperand(0).getReg(), C); 4974 MI.eraseFromParent(); 4975 return true; 4976 } 4977 4978 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4979 MachineInstr &MI) const { 4980 MachineIRBuilder &B = Helper.MIRBuilder; 4981 MachineRegisterInfo &MRI = *B.getMRI(); 4982 4983 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
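  // verifyCFIntrinsic is expected to locate the G_BRCOND (and any trailing
  // unconditional G_BR) that consumes the intrinsic's boolean result; that
  // branch is then rewritten below into the SI_IF / SI_ELSE / SI_LOOP pseudos.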
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND use with the exec-manipulation and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
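  // The intrinsics below read values the ABI preloads into registers. Workitem
  // IDs along a dimension known to be degenerate (maximum ID of zero) are
  // folded to a zero constant instead of reading an argument.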
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
      return replaceWithConstant(B, MI, 0);
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
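  // Every raw/struct buffer atomic variant funnels into legalizeBufferAtomic;
  // the intrinsic ID is forwarded so that helper can select the matching
  // buffer-atomic operation.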
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}