//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// workaround this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will workaround this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
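  // (the bitcast is only attempted when the in-memory type is scalar or
  // matches the in-register vector type)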
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case when the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .minScalar(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
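    // (so the has16BitInsts() check guarding this branch is sufficient here)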
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
    {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
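  // G_FMAD is therefore custom-legalized below; see legalizeFMad().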
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
    .customFor({S16, S32})
    .scalarize(0)
    .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
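              // (the result register is wider than the memory access; narrow
              // the result type to the memory size first)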
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion
    // in the IR in most situations
    // TODO: Move atomic expansion into legalizer
    // TODO: Also supports <2 x f16>
    Atomic.legalFor({
        {S32, GlobalPtr},
        {S64, GlobalPtr},
        {S64, FlatPtr}
      });
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
        .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
              VecTypeIdx,
              LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
          // TODO: verify immediate offset here since lower only works with
          // whole elements.
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.isVector();
        })
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
      .lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
      .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
MRI, B); 1742 case TargetOpcode::G_UDIV: 1743 case TargetOpcode::G_UREM: 1744 case TargetOpcode::G_UDIVREM: 1745 return legalizeUnsignedDIV_REM(MI, MRI, B); 1746 case TargetOpcode::G_SDIV: 1747 case TargetOpcode::G_SREM: 1748 case TargetOpcode::G_SDIVREM: 1749 return legalizeSignedDIV_REM(MI, MRI, B); 1750 case TargetOpcode::G_ATOMIC_CMPXCHG: 1751 return legalizeAtomicCmpXChg(MI, MRI, B); 1752 case TargetOpcode::G_FLOG: 1753 return legalizeFlog(MI, B, numbers::ln2f); 1754 case TargetOpcode::G_FLOG10: 1755 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1756 case TargetOpcode::G_FEXP: 1757 return legalizeFExp(MI, B); 1758 case TargetOpcode::G_FPOW: 1759 return legalizeFPow(MI, B); 1760 case TargetOpcode::G_FFLOOR: 1761 return legalizeFFloor(MI, MRI, B); 1762 case TargetOpcode::G_BUILD_VECTOR: 1763 return legalizeBuildVector(MI, MRI, B); 1764 case TargetOpcode::G_CTLZ: 1765 case TargetOpcode::G_CTTZ: 1766 return legalizeCTLZ_CTTZ(MI, MRI, B); 1767 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: 1768 return legalizeFPTruncRound(MI, B); 1769 default: 1770 return false; 1771 } 1772 1773 llvm_unreachable("expected switch to return"); 1774 } 1775 1776 Register AMDGPULegalizerInfo::getSegmentAperture( 1777 unsigned AS, 1778 MachineRegisterInfo &MRI, 1779 MachineIRBuilder &B) const { 1780 MachineFunction &MF = B.getMF(); 1781 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1782 const LLT S32 = LLT::scalar(32); 1783 1784 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1785 1786 if (ST.hasApertureRegs()) { 1787 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1788 // getreg. 1789 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1790 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1791 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1792 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1793 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1794 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1795 unsigned Encoding = 1796 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1797 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1798 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1799 1800 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1801 1802 B.buildInstr(AMDGPU::S_GETREG_B32) 1803 .addDef(GetReg) 1804 .addImm(Encoding); 1805 MRI.setType(GetReg, S32); 1806 1807 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1808 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1809 } 1810 1811 Register QueuePtr = MRI.createGenericVirtualRegister( 1812 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1813 1814 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1815 return Register(); 1816 1817 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1818 // private_segment_aperture_base_hi. 1819 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1820 1821 // TODO: can we be smarter about machine pointer info? 1822 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1823 MachineMemOperand *MMO = MF.getMachineMemOperand( 1824 PtrInfo, 1825 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1826 MachineMemOperand::MOInvariant, 1827 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1828 1829 Register LoadAddr; 1830 1831 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1832 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1833 } 1834 1835 /// Return true if the value is a known valid address, such that a null check is 1836 /// not necessary. 
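/// (e.g. the result of a G_FRAME_INDEX, G_GLOBAL_VALUE or G_BLOCK_ADDR, or a
/// G_CONSTANT that differs from the target's null value for the address
/// space in question, as checked below).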
1837 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1838 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1839 MachineInstr *Def = MRI.getVRegDef(Val); 1840 switch (Def->getOpcode()) { 1841 case AMDGPU::G_FRAME_INDEX: 1842 case AMDGPU::G_GLOBAL_VALUE: 1843 case AMDGPU::G_BLOCK_ADDR: 1844 return true; 1845 case AMDGPU::G_CONSTANT: { 1846 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1847 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1848 } 1849 default: 1850 return false; 1851 } 1852 1853 return false; 1854 } 1855 1856 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1857 MachineInstr &MI, MachineRegisterInfo &MRI, 1858 MachineIRBuilder &B) const { 1859 MachineFunction &MF = B.getMF(); 1860 1861 const LLT S32 = LLT::scalar(32); 1862 Register Dst = MI.getOperand(0).getReg(); 1863 Register Src = MI.getOperand(1).getReg(); 1864 1865 LLT DstTy = MRI.getType(Dst); 1866 LLT SrcTy = MRI.getType(Src); 1867 unsigned DestAS = DstTy.getAddressSpace(); 1868 unsigned SrcAS = SrcTy.getAddressSpace(); 1869 1870 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1871 // vector element. 1872 assert(!DstTy.isVector()); 1873 1874 const AMDGPUTargetMachine &TM 1875 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1876 1877 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1878 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1879 return true; 1880 } 1881 1882 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 1883 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 1884 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1885 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1886 // Extract low 32-bits of the pointer. 1887 B.buildExtract(Dst, Src, 0); 1888 MI.eraseFromParent(); 1889 return true; 1890 } 1891 1892 unsigned NullVal = TM.getNullPointerValue(DestAS); 1893 1894 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1895 auto FlatNull = B.buildConstant(SrcTy, 0); 1896 1897 // Extract low 32-bits of the pointer. 1898 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1899 1900 auto CmpRes = 1901 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1902 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1903 1904 MI.eraseFromParent(); 1905 return true; 1906 } 1907 1908 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 1909 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 1910 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1911 if (!ST.hasFlatAddressSpace()) 1912 return false; 1913 1914 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1915 if (!ApertureReg.isValid()) 1916 return false; 1917 1918 // Coerce the type of the low half of the result so we can use merge_values. 1919 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1920 1921 // TODO: Should we allow mismatched types but matching sizes in merges to 1922 // avoid the ptrtoint? 1923 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1924 1925 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1926 B.buildCopy(Dst, BuildPtr); 1927 MI.eraseFromParent(); 1928 return true; 1929 } 1930 1931 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1932 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1933 1934 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 1935 SegmentNull.getReg(0)); 1936 1937 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1938 1939 MI.eraseFromParent(); 1940 return true; 1941 } 1942 1943 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 1944 SrcTy.getSizeInBits() == 64) { 1945 // Truncate. 
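    // Only the low 32 bits of the source pointer are meaningful here, so this
    // becomes roughly %dst = G_EXTRACT %src, 0 in MIR.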
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      DstTy.getSizeInBits() == 64) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating
    // an extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());

  LLVMContext &Ctx = MF.getFunction().getContext();
  Ctx.diagnose(InvalidAddrSpaceCast);
  B.buildUndef(Dst);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
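  // Worked example of the expansion above: for src = 1.5, trunc = 1.0 and
  // Add = 1.0 (src > 0 and src != trunc), so the result is 2.0; for
  // src = -1.5, trunc = -1.0 and Add = 0.0, so the result stays -1.0.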
2026 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2027 MI.eraseFromParent(); 2028 return true; 2029 } 2030 2031 bool AMDGPULegalizerInfo::legalizeFrem( 2032 MachineInstr &MI, MachineRegisterInfo &MRI, 2033 MachineIRBuilder &B) const { 2034 Register DstReg = MI.getOperand(0).getReg(); 2035 Register Src0Reg = MI.getOperand(1).getReg(); 2036 Register Src1Reg = MI.getOperand(2).getReg(); 2037 auto Flags = MI.getFlags(); 2038 LLT Ty = MRI.getType(DstReg); 2039 2040 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2041 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2042 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2043 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2044 MI.eraseFromParent(); 2045 return true; 2046 } 2047 2048 static MachineInstrBuilder extractF64Exponent(Register Hi, 2049 MachineIRBuilder &B) { 2050 const unsigned FractBits = 52; 2051 const unsigned ExpBits = 11; 2052 LLT S32 = LLT::scalar(32); 2053 2054 auto Const0 = B.buildConstant(S32, FractBits - 32); 2055 auto Const1 = B.buildConstant(S32, ExpBits); 2056 2057 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2058 .addUse(Hi) 2059 .addUse(Const0.getReg(0)) 2060 .addUse(Const1.getReg(0)); 2061 2062 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2063 } 2064 2065 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2066 MachineInstr &MI, MachineRegisterInfo &MRI, 2067 MachineIRBuilder &B) const { 2068 const LLT S1 = LLT::scalar(1); 2069 const LLT S32 = LLT::scalar(32); 2070 const LLT S64 = LLT::scalar(64); 2071 2072 Register Src = MI.getOperand(1).getReg(); 2073 assert(MRI.getType(Src) == S64); 2074 2075 // TODO: Should this use extract since the low half is unused? 2076 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2077 Register Hi = Unmerge.getReg(1); 2078 2079 // Extract the upper half, since this is where we will find the sign and 2080 // exponent. 2081 auto Exp = extractF64Exponent(Hi, B); 2082 2083 const unsigned FractBits = 52; 2084 2085 // Extract the sign bit. 2086 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2087 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2088 2089 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2090 2091 const auto Zero32 = B.buildConstant(S32, 0); 2092 2093 // Extend back to 64-bits. 2094 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2095 2096 auto Shr = B.buildAShr(S64, FractMask, Exp); 2097 auto Not = B.buildNot(S64, Shr); 2098 auto Tmp0 = B.buildAnd(S64, Src, Not); 2099 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2100 2101 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2102 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2103 2104 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2105 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2106 MI.eraseFromParent(); 2107 return true; 2108 } 2109 2110 bool AMDGPULegalizerInfo::legalizeITOFP( 2111 MachineInstr &MI, MachineRegisterInfo &MRI, 2112 MachineIRBuilder &B, bool Signed) const { 2113 2114 Register Dst = MI.getOperand(0).getReg(); 2115 Register Src = MI.getOperand(1).getReg(); 2116 2117 const LLT S64 = LLT::scalar(64); 2118 const LLT S32 = LLT::scalar(32); 2119 2120 assert(MRI.getType(Src) == S64); 2121 2122 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2123 auto ThirtyTwo = B.buildConstant(S32, 32); 2124 2125 if (MRI.getType(Dst) == S64) { 2126 auto CvtHi = Signed ? 
B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
                     .addUse(CvtHi.getReg(0))
                     .addUse(ThirtyTwo.getReg(0));

    // TODO: Should this propagate fast-math-flags?
    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();
    return true;
  }

  assert(MRI.getType(Dst) == S32);

  auto One = B.buildConstant(S32, 1);

  MachineInstrBuilder ShAmt;
  if (Signed) {
    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
                               /*HasSideEffects=*/false)
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  } else
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
                   /*HasSideEffects=*/false)
      .addUse(FVal.getReg(0))
      .addUse(Scale.getReg(0));
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //   tf := trunc(val);
  //  hif := floor(tf * 2^-32);
  //  lof := tf - hif * 2^32; // lof is always positive due to floor.
  //   hi := fptoi(hif);
  //   lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only 23 mantissa bits, which
    // is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
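    // Concretely (see the tail of this function): the result is computed on
    // |trunc(val)| and flipped back as (x ^ sign) - sign, where sign is
    // all-ones when val is negative.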
2206 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2207 Trunc = B.buildFAbs(S32, Trunc, Flags); 2208 } 2209 MachineInstrBuilder K0, K1; 2210 if (SrcLT == S64) { 2211 K0 = B.buildFConstant(S64, 2212 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2213 K1 = B.buildFConstant(S64, 2214 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2215 } else { 2216 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2217 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2218 } 2219 2220 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2221 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2222 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2223 2224 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2225 : B.buildFPTOUI(S32, FloorMul); 2226 auto Lo = B.buildFPTOUI(S32, Fma); 2227 2228 if (Signed && SrcLT == S32) { 2229 // Flip the result based on the signedness, which is either all 0s or 1s. 2230 Sign = B.buildMerge(S64, {Sign, Sign}); 2231 // r := xor({lo, hi}, sign) - sign; 2232 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2233 } else 2234 B.buildMerge(Dst, {Lo, Hi}); 2235 MI.eraseFromParent(); 2236 2237 return true; 2238 } 2239 2240 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2241 MachineInstr &MI) const { 2242 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2243 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2244 2245 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2246 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2247 2248 // With ieee_mode disabled, the instructions have the correct behavior 2249 // already for G_FMINNUM/G_FMAXNUM 2250 if (!MFI->getMode().IEEE) 2251 return !IsIEEEOp; 2252 2253 if (IsIEEEOp) 2254 return true; 2255 2256 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2257 } 2258 2259 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2260 MachineInstr &MI, MachineRegisterInfo &MRI, 2261 MachineIRBuilder &B) const { 2262 // TODO: Should move some of this into LegalizerHelper. 2263 2264 // TODO: Promote dynamic indexing of s16 to s32 2265 2266 // FIXME: Artifact combiner probably should have replaced the truncated 2267 // constant before this, so we shouldn't need 2268 // getIConstantVRegValWithLookThrough. 2269 Optional<ValueAndVReg> MaybeIdxVal = 2270 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2271 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2272 return true; 2273 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2274 2275 Register Dst = MI.getOperand(0).getReg(); 2276 Register Vec = MI.getOperand(1).getReg(); 2277 2278 LLT VecTy = MRI.getType(Vec); 2279 LLT EltTy = VecTy.getElementType(); 2280 assert(EltTy == MRI.getType(Dst)); 2281 2282 if (IdxVal < VecTy.getNumElements()) { 2283 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2284 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2285 } else { 2286 B.buildUndef(Dst); 2287 } 2288 2289 MI.eraseFromParent(); 2290 return true; 2291 } 2292 2293 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2294 MachineInstr &MI, MachineRegisterInfo &MRI, 2295 MachineIRBuilder &B) const { 2296 // TODO: Should move some of this into LegalizerHelper. 2297 2298 // TODO: Promote dynamic indexing of s16 to s32 2299 2300 // FIXME: Artifact combiner probably should have replaced the truncated 2301 // constant before this, so we shouldn't need 2302 // getIConstantVRegValWithLookThrough. 
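  // As with the extract case above, only a constant index is handled here; a
  // dynamic index is left in place and selected to register indexing later.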
  Optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;

  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));
  (void)Ins;

  unsigned NumElts = VecTy.getNumElements();
  if (IdxVal < NumElts) {
    SmallVector<Register, 8> SrcRegs;
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMerge(Dst, SrcRegs);
  } else {
    B.buildUndef(Dst);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeShuffleVector(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src0);

  if (SrcTy == V2S16 && DstTy == V2S16 &&
      AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
                  .addUse(MulVal.getReg(0))
                  .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
      .addUse(TrigVal)
      .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is
  // lowered to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc (which
  // is the address of the s_add_u32 instruction) to the global variable, but
  // since the encoding of $symbol starts 4 bytes after the start of the
  // s_add_u32 instruction, we end up with an offset that is 4 bytes too small.
  // This requires us to add 4 to the global variable offset in order to
  // compute the correct address. Similarly for the s_addc_u32 instruction,
  // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
  // instruction.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ?
      DstReg :
      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
                                .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function",
        MI.getDebugLoc(), DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place.
    }

    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or a similar
      // zero-sized type in other languages to declare dynamic shared memory
      // whose size is not known at compile time. These objects will be
      // allocated by the runtime and placed directly after the statically
      // allocated ones. They all share the same offset.
      if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        // Adjust alignment for that dynamic shared memory array.
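        // e.g. HIP's `extern __shared__ float s[];` reaches this point as a
        // zero-sized external LDS global; its address is the end of the
        // statically allocated LDS block, which amdgcn.groupstaticsize
        // returns below.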
2492 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2493 LLT S32 = LLT::scalar(32); 2494 auto Sz = 2495 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2496 B.buildIntToPtr(DstReg, Sz); 2497 MI.eraseFromParent(); 2498 return true; 2499 } 2500 } 2501 2502 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2503 *cast<GlobalVariable>(GV))); 2504 MI.eraseFromParent(); 2505 return true; 2506 } 2507 2508 const SITargetLowering *TLI = ST.getTargetLowering(); 2509 2510 if (TLI->shouldEmitFixup(GV)) { 2511 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2512 MI.eraseFromParent(); 2513 return true; 2514 } 2515 2516 if (TLI->shouldEmitPCReloc(GV)) { 2517 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2518 MI.eraseFromParent(); 2519 return true; 2520 } 2521 2522 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2523 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2524 2525 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2526 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2527 MachinePointerInfo::getGOT(MF), 2528 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2529 MachineMemOperand::MOInvariant, 2530 LoadTy, Align(8)); 2531 2532 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2533 2534 if (Ty.getSizeInBits() == 32) { 2535 // Truncate if this is a 32-bit constant address. 2536 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2537 B.buildExtract(DstReg, Load, 0); 2538 } else 2539 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2540 2541 MI.eraseFromParent(); 2542 return true; 2543 } 2544 2545 static LLT widenToNextPowerOf2(LLT Ty) { 2546 if (Ty.isVector()) 2547 return Ty.changeElementCount( 2548 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2549 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2550 } 2551 2552 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2553 MachineInstr &MI) const { 2554 MachineIRBuilder &B = Helper.MIRBuilder; 2555 MachineRegisterInfo &MRI = *B.getMRI(); 2556 GISelChangeObserver &Observer = Helper.Observer; 2557 2558 Register PtrReg = MI.getOperand(1).getReg(); 2559 LLT PtrTy = MRI.getType(PtrReg); 2560 unsigned AddrSpace = PtrTy.getAddressSpace(); 2561 2562 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2563 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2564 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2565 Observer.changingInstr(MI); 2566 MI.getOperand(1).setReg(Cast.getReg(0)); 2567 Observer.changedInstr(MI); 2568 return true; 2569 } 2570 2571 if (MI.getOpcode() != AMDGPU::G_LOAD) 2572 return false; 2573 2574 Register ValReg = MI.getOperand(0).getReg(); 2575 LLT ValTy = MRI.getType(ValReg); 2576 2577 MachineMemOperand *MMO = *MI.memoperands_begin(); 2578 const unsigned ValSize = ValTy.getSizeInBits(); 2579 const LLT MemTy = MMO->getMemoryType(); 2580 const Align MemAlign = MMO->getAlign(); 2581 const unsigned MemSize = MemTy.getSizeInBits(); 2582 const uint64_t AlignInBits = 8 * MemAlign.value(); 2583 2584 // Widen non-power-of-2 loads to the alignment if needed 2585 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2586 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2587 2588 // This was already the correct extending load result type, so just adjust 2589 // the memory type. 
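      // e.g. an s32 result fed by a 3-byte (s24) memory access simply becomes
      // a plain 4-byte load.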
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling an edge case that should probably never be
    // produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::fixed_vector(2, ValTy);

  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFlog(
  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);

  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
MI.eraseFromParent(); 2693 return true; 2694 } 2695 2696 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2697 MachineIRBuilder &B) const { 2698 Register Dst = MI.getOperand(0).getReg(); 2699 Register Src = MI.getOperand(1).getReg(); 2700 unsigned Flags = MI.getFlags(); 2701 LLT Ty = B.getMRI()->getType(Dst); 2702 2703 auto K = B.buildFConstant(Ty, numbers::log2e); 2704 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2705 B.buildFExp2(Dst, Mul, Flags); 2706 MI.eraseFromParent(); 2707 return true; 2708 } 2709 2710 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2711 MachineIRBuilder &B) const { 2712 Register Dst = MI.getOperand(0).getReg(); 2713 Register Src0 = MI.getOperand(1).getReg(); 2714 Register Src1 = MI.getOperand(2).getReg(); 2715 unsigned Flags = MI.getFlags(); 2716 LLT Ty = B.getMRI()->getType(Dst); 2717 const LLT S16 = LLT::scalar(16); 2718 const LLT S32 = LLT::scalar(32); 2719 2720 if (Ty == S32) { 2721 auto Log = B.buildFLog2(S32, Src0, Flags); 2722 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2723 .addUse(Log.getReg(0)) 2724 .addUse(Src1) 2725 .setMIFlags(Flags); 2726 B.buildFExp2(Dst, Mul, Flags); 2727 } else if (Ty == S16) { 2728 // There's no f16 fmul_legacy, so we need to convert for it. 2729 auto Log = B.buildFLog2(S16, Src0, Flags); 2730 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2731 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2732 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2733 .addUse(Ext0.getReg(0)) 2734 .addUse(Ext1.getReg(0)) 2735 .setMIFlags(Flags); 2736 2737 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2738 } else 2739 return false; 2740 2741 MI.eraseFromParent(); 2742 return true; 2743 } 2744 2745 // Find a source register, ignoring any possible source modifiers. 2746 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2747 Register ModSrc = OrigSrc; 2748 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2749 ModSrc = SrcFNeg->getOperand(1).getReg(); 2750 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2751 ModSrc = SrcFAbs->getOperand(1).getReg(); 2752 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2753 ModSrc = SrcFAbs->getOperand(1).getReg(); 2754 return ModSrc; 2755 } 2756 2757 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2758 MachineRegisterInfo &MRI, 2759 MachineIRBuilder &B) const { 2760 2761 const LLT S1 = LLT::scalar(1); 2762 const LLT S64 = LLT::scalar(64); 2763 Register Dst = MI.getOperand(0).getReg(); 2764 Register OrigSrc = MI.getOperand(1).getReg(); 2765 unsigned Flags = MI.getFlags(); 2766 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2767 "this should not have been custom lowered"); 2768 2769 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2770 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2771 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2772 // V_FRACT bug is: 2773 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2774 // 2775 // Convert floor(x) to (x - fract(x)) 2776 2777 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2778 .addUse(OrigSrc) 2779 .setMIFlags(Flags); 2780 2781 // Give source modifier matching some assistance before obscuring a foldable 2782 // pattern. 2783 2784 // TODO: We can avoid the neg on the fract? The input sign to fract 2785 // shouldn't matter? 
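  // Note that ModSrc below may differ from OrigSrc: any G_FNEG/G_FABS wrapper
  // is looked through so the isnan check can still match the form the source
  // modifiers will fold into.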
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(S64);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  else
    B.buildFMinNum(Min, Fract, Const, Flags);

  Register CorrectedFract = Min;
  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
  }

  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  return true;
}

// Turn an illegal packed v2s16 build vector into bit operations.
// TODO: This should probably be a bitcast action in LegalizerHelper.
bool AMDGPULegalizerInfo::legalizeBuildVector(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  const LLT S32 = LLT::scalar(32);
  assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  assert(MRI.getType(Src0) == LLT::scalar(16));

  auto Merge = B.buildMerge(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
  return true;
}

// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
// case with a single min instruction instead of a compare+select.
bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));

  MI.eraseFromParent();
  return true;
}

// Check that this is a G_XOR x, -1
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
  if (MI.getOpcode() != TargetOpcode::G_XOR)
    return false;
  auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
  return ConstVal && *ConstVal == -1;
}

// Return the use branch instruction, or null if the usage is invalid.
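// The condition def is expected to feed (possibly through a single
// G_XOR x, -1 negation) a G_BRCOND in the same block, followed either by a
// G_BR giving the unconditional target or by nothing, in which case the next
// block is used as the fallthrough target.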
2862 static MachineInstr * 2863 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2864 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2865 Register CondDef = MI.getOperand(0).getReg(); 2866 if (!MRI.hasOneNonDBGUse(CondDef)) 2867 return nullptr; 2868 2869 MachineBasicBlock *Parent = MI.getParent(); 2870 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2871 2872 if (isNot(MRI, *UseMI)) { 2873 Register NegatedCond = UseMI->getOperand(0).getReg(); 2874 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2875 return nullptr; 2876 2877 // We're deleting the def of this value, so we need to remove it. 2878 eraseInstr(*UseMI, MRI); 2879 2880 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2881 Negated = true; 2882 } 2883 2884 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2885 return nullptr; 2886 2887 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2888 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2889 if (Next == Parent->end()) { 2890 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2891 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2892 return nullptr; 2893 UncondBrTarget = &*NextMBB; 2894 } else { 2895 if (Next->getOpcode() != AMDGPU::G_BR) 2896 return nullptr; 2897 Br = &*Next; 2898 UncondBrTarget = Br->getOperand(0).getMBB(); 2899 } 2900 2901 return UseMI; 2902 } 2903 2904 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2905 const ArgDescriptor *Arg, 2906 const TargetRegisterClass *ArgRC, 2907 LLT ArgTy) const { 2908 MCRegister SrcReg = Arg->getRegister(); 2909 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2910 assert(DstReg.isVirtual() && "Virtual register expected"); 2911 2912 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 2913 *ArgRC, B.getDebugLoc(), ArgTy); 2914 if (Arg->isMasked()) { 2915 // TODO: Should we try to emit this once in the entry block? 2916 const LLT S32 = LLT::scalar(32); 2917 const unsigned Mask = Arg->getMask(); 2918 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2919 2920 Register AndMaskSrc = LiveIn; 2921 2922 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 2923 // 0. 2924 if (Shift != 0) { 2925 auto ShiftAmt = B.buildConstant(S32, Shift); 2926 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2927 } 2928 2929 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2930 } else { 2931 B.buildCopy(DstReg, LiveIn); 2932 } 2933 2934 return true; 2935 } 2936 2937 bool AMDGPULegalizerInfo::loadInputValue( 2938 Register DstReg, MachineIRBuilder &B, 2939 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2940 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2941 const ArgDescriptor *Arg; 2942 const TargetRegisterClass *ArgRC; 2943 LLT ArgTy; 2944 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2945 2946 if (!Arg) { 2947 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2948 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2949 // case the pointer argument may be missing and we use null. 2950 B.buildConstant(DstReg, 0); 2951 return true; 2952 } 2953 2954 // It's undefined behavior if a function marked with the amdgpu-no-* 2955 // attributes uses the corresponding intrinsic. 
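    // (e.g. using llvm.amdgcn.workgroup.id.x from a function carrying
    // "amdgpu-no-workgroup-id-x"), so folding the read to undef is fine.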
    B.buildUndef(DstReg);
    return true;
  }

  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these
  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR.
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
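  // Each refinement step can only bump the quotient by one; e.g. for x=7, y=2
  // an estimate of q=2, r=3 becomes q=3, r=1 after the first step and is left
  // unchanged by the second.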
3029 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3030 if (DstDivReg) 3031 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 3032 3033 if (DstRemReg) 3034 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3035 } 3036 3037 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3038 // 3039 // Return lo, hi of result 3040 // 3041 // %cvt.lo = G_UITOFP Val.lo 3042 // %cvt.hi = G_UITOFP Val.hi 3043 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3044 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3045 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3046 // %mul2 = G_FMUL %mul1, 2**(-32) 3047 // %trunc = G_INTRINSIC_TRUNC %mul2 3048 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3049 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3050 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3051 Register Val) { 3052 const LLT S32 = LLT::scalar(32); 3053 auto Unmerge = B.buildUnmerge(S32, Val); 3054 3055 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3056 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3057 3058 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3059 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3060 3061 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3062 auto Mul1 = 3063 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3064 3065 // 2**(-32) 3066 auto Mul2 = 3067 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3068 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3069 3070 // -(2**32) 3071 auto Mad2 = B.buildFMAD(S32, Trunc, 3072 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3073 3074 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3075 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3076 3077 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3078 } 3079 3080 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3081 Register DstDivReg, 3082 Register DstRemReg, 3083 Register Numer, 3084 Register Denom) const { 3085 const LLT S32 = LLT::scalar(32); 3086 const LLT S64 = LLT::scalar(64); 3087 const LLT S1 = LLT::scalar(1); 3088 Register RcpLo, RcpHi; 3089 3090 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3091 3092 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3093 3094 auto Zero64 = B.buildConstant(S64, 0); 3095 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3096 3097 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3098 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3099 3100 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3101 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3102 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3103 3104 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3105 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3106 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3107 3108 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3109 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3110 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3111 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3112 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3113 3114 auto Zero32 = B.buildConstant(S32, 0); 3115 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3116 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3117 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3118 3119 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3120 Register NumerLo = UnmergeNumer.getReg(0); 3121 Register NumerHi = UnmergeNumer.getReg(1); 3122 3123 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3124 auto Mul3 = B.buildMul(S64, Denom, 
MulHi3); 3125 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3126 Register Mul3_Lo = UnmergeMul3.getReg(0); 3127 Register Mul3_Hi = UnmergeMul3.getReg(1); 3128 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3129 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3130 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3131 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3132 3133 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3134 Register DenomLo = UnmergeDenom.getReg(0); 3135 Register DenomHi = UnmergeDenom.getReg(1); 3136 3137 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3138 auto C1 = B.buildSExt(S32, CmpHi); 3139 3140 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3141 auto C2 = B.buildSExt(S32, CmpLo); 3142 3143 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3144 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3145 3146 // TODO: Here and below portions of the code can be enclosed into if/endif. 3147 // Currently control flow is unconditional and we have 4 selects after 3148 // potential endif to substitute PHIs. 3149 3150 // if C3 != 0 ... 3151 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3152 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3153 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3154 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3155 3156 auto One64 = B.buildConstant(S64, 1); 3157 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3158 3159 auto C4 = 3160 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3161 auto C5 = 3162 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3163 auto C6 = B.buildSelect( 3164 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3165 3166 // if (C6 != 0) 3167 auto Add4 = B.buildAdd(S64, Add3, One64); 3168 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3169 3170 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3171 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3172 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3173 3174 // endif C6 3175 // endif C3 3176 3177 if (DstDivReg) { 3178 auto Sel1 = B.buildSelect( 3179 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3180 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3181 Sel1, MulHi3); 3182 } 3183 3184 if (DstRemReg) { 3185 auto Sel2 = B.buildSelect( 3186 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3187 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3188 Sel2, Sub1); 3189 } 3190 } 3191 3192 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3193 MachineRegisterInfo &MRI, 3194 MachineIRBuilder &B) const { 3195 Register DstDivReg, DstRemReg; 3196 switch (MI.getOpcode()) { 3197 default: 3198 llvm_unreachable("Unexpected opcode!"); 3199 case AMDGPU::G_UDIV: { 3200 DstDivReg = MI.getOperand(0).getReg(); 3201 break; 3202 } 3203 case AMDGPU::G_UREM: { 3204 DstRemReg = MI.getOperand(0).getReg(); 3205 break; 3206 } 3207 case AMDGPU::G_UDIVREM: { 3208 DstDivReg = MI.getOperand(0).getReg(); 3209 DstRemReg = MI.getOperand(1).getReg(); 3210 break; 3211 } 3212 } 3213 3214 const LLT S64 = LLT::scalar(64); 3215 const LLT S32 = LLT::scalar(32); 3216 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3217 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3218 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3219 LLT Ty 
= MRI.getType(MI.getOperand(0).getReg()); 3220 3221 if (Ty == S32) 3222 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3223 else if (Ty == S64) 3224 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3225 else 3226 return false; 3227 3228 MI.eraseFromParent(); 3229 return true; 3230 } 3231 3232 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3233 MachineRegisterInfo &MRI, 3234 MachineIRBuilder &B) const { 3235 const LLT S64 = LLT::scalar(64); 3236 const LLT S32 = LLT::scalar(32); 3237 3238 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3239 if (Ty != S32 && Ty != S64) 3240 return false; 3241 3242 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3243 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3244 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3245 3246 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3247 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3248 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3249 3250 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3251 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3252 3253 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3254 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3255 3256 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3257 switch (MI.getOpcode()) { 3258 default: 3259 llvm_unreachable("Unexpected opcode!"); 3260 case AMDGPU::G_SDIV: { 3261 DstDivReg = MI.getOperand(0).getReg(); 3262 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3263 break; 3264 } 3265 case AMDGPU::G_SREM: { 3266 DstRemReg = MI.getOperand(0).getReg(); 3267 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3268 break; 3269 } 3270 case AMDGPU::G_SDIVREM: { 3271 DstDivReg = MI.getOperand(0).getReg(); 3272 DstRemReg = MI.getOperand(1).getReg(); 3273 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3274 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3275 break; 3276 } 3277 } 3278 3279 if (Ty == S32) 3280 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3281 else 3282 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3283 3284 if (DstDivReg) { 3285 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3286 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3287 B.buildSub(DstDivReg, SignXor, Sign); 3288 } 3289 3290 if (DstRemReg) { 3291 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3292 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3293 B.buildSub(DstRemReg, SignXor, Sign); 3294 } 3295 3296 MI.eraseFromParent(); 3297 return true; 3298 } 3299 3300 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3301 MachineRegisterInfo &MRI, 3302 MachineIRBuilder &B) const { 3303 Register Res = MI.getOperand(0).getReg(); 3304 Register LHS = MI.getOperand(1).getReg(); 3305 Register RHS = MI.getOperand(2).getReg(); 3306 uint16_t Flags = MI.getFlags(); 3307 LLT ResTy = MRI.getType(Res); 3308 3309 const MachineFunction &MF = B.getMF(); 3310 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3311 MI.getFlag(MachineInstr::FmAfn); 3312 3313 if (!AllowInaccurateRcp) 3314 return false; 3315 3316 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3317 // 1 / x -> RCP(x) 3318 if (CLHS->isExactlyValue(1.0)) { 3319 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3320 .addUse(RHS) 3321 .setMIFlags(Flags); 3322 3323 MI.eraseFromParent(); 3324 return true; 3325 } 3326 3327 // -1 / x -> RCP( FNEG(x) ) 3328 if (CLHS->isExactlyValue(-1.0)) { 3329 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 
3330 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3331 .addUse(FNeg.getReg(0)) 3332 .setMIFlags(Flags); 3333 3334 MI.eraseFromParent(); 3335 return true; 3336 } 3337 } 3338 3339 // x / y -> x * (1.0 / y) 3340 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3341 .addUse(RHS) 3342 .setMIFlags(Flags); 3343 B.buildFMul(Res, LHS, RCP, Flags); 3344 3345 MI.eraseFromParent(); 3346 return true; 3347 } 3348 3349 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3350 MachineRegisterInfo &MRI, 3351 MachineIRBuilder &B) const { 3352 Register Res = MI.getOperand(0).getReg(); 3353 Register X = MI.getOperand(1).getReg(); 3354 Register Y = MI.getOperand(2).getReg(); 3355 uint16_t Flags = MI.getFlags(); 3356 LLT ResTy = MRI.getType(Res); 3357 3358 const MachineFunction &MF = B.getMF(); 3359 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3360 MI.getFlag(MachineInstr::FmAfn); 3361 3362 if (!AllowInaccurateRcp) 3363 return false; 3364 3365 auto NegY = B.buildFNeg(ResTy, Y); 3366 auto One = B.buildFConstant(ResTy, 1.0); 3367 3368 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3369 .addUse(Y) 3370 .setMIFlags(Flags); 3371 3372 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3373 R = B.buildFMA(ResTy, Tmp0, R, R); 3374 3375 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3376 R = B.buildFMA(ResTy, Tmp1, R, R); 3377 3378 auto Ret = B.buildFMul(ResTy, X, R); 3379 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3380 3381 B.buildFMA(Res, Tmp2, R, Ret); 3382 MI.eraseFromParent(); 3383 return true; 3384 } 3385 3386 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3387 MachineRegisterInfo &MRI, 3388 MachineIRBuilder &B) const { 3389 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3390 return true; 3391 3392 Register Res = MI.getOperand(0).getReg(); 3393 Register LHS = MI.getOperand(1).getReg(); 3394 Register RHS = MI.getOperand(2).getReg(); 3395 3396 uint16_t Flags = MI.getFlags(); 3397 3398 LLT S16 = LLT::scalar(16); 3399 LLT S32 = LLT::scalar(32); 3400 3401 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3402 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3403 3404 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3405 .addUse(RHSExt.getReg(0)) 3406 .setMIFlags(Flags); 3407 3408 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3409 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3410 3411 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3412 .addUse(RDst.getReg(0)) 3413 .addUse(RHS) 3414 .addUse(LHS) 3415 .setMIFlags(Flags); 3416 3417 MI.eraseFromParent(); 3418 return true; 3419 } 3420 3421 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3422 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3423 static void toggleSPDenormMode(bool Enable, 3424 MachineIRBuilder &B, 3425 const GCNSubtarget &ST, 3426 AMDGPU::SIModeRegisterDefaults Mode) { 3427 // Set SP denorm mode to this value. 3428 unsigned SPDenormMode = 3429 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3430 3431 if (ST.hasDenormModeInst()) { 3432 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3433 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3434 3435 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3436 B.buildInstr(AMDGPU::S_DENORM_MODE) 3437 .addImm(NewDenormModeValue); 3438 3439 } else { 3440 // Select FP32 bit field in mode register. 
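    // The setreg immediate below selects the MODE hwreg at bit offset 4 with a
    // width of 2 (WIDTH_M1 == 1), i.e. the two FP32 denorm control bits.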
3441 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3442 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3443 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3444 3445 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3446 .addImm(SPDenormMode) 3447 .addImm(SPDenormModeBitField); 3448 } 3449 } 3450 3451 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3452 MachineRegisterInfo &MRI, 3453 MachineIRBuilder &B) const { 3454 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3455 return true; 3456 3457 Register Res = MI.getOperand(0).getReg(); 3458 Register LHS = MI.getOperand(1).getReg(); 3459 Register RHS = MI.getOperand(2).getReg(); 3460 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3461 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3462 3463 uint16_t Flags = MI.getFlags(); 3464 3465 LLT S32 = LLT::scalar(32); 3466 LLT S1 = LLT::scalar(1); 3467 3468 auto One = B.buildFConstant(S32, 1.0f); 3469 3470 auto DenominatorScaled = 3471 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3472 .addUse(LHS) 3473 .addUse(RHS) 3474 .addImm(0) 3475 .setMIFlags(Flags); 3476 auto NumeratorScaled = 3477 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3478 .addUse(LHS) 3479 .addUse(RHS) 3480 .addImm(1) 3481 .setMIFlags(Flags); 3482 3483 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3484 .addUse(DenominatorScaled.getReg(0)) 3485 .setMIFlags(Flags); 3486 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3487 3488 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3489 // aren't modeled as reading it. 3490 if (!Mode.allFP32Denormals()) 3491 toggleSPDenormMode(true, B, ST, Mode); 3492 3493 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3494 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3495 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3496 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3497 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3498 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3499 3500 if (!Mode.allFP32Denormals()) 3501 toggleSPDenormMode(false, B, ST, Mode); 3502 3503 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3504 .addUse(Fma4.getReg(0)) 3505 .addUse(Fma1.getReg(0)) 3506 .addUse(Fma3.getReg(0)) 3507 .addUse(NumeratorScaled.getReg(1)) 3508 .setMIFlags(Flags); 3509 3510 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3511 .addUse(Fmas.getReg(0)) 3512 .addUse(RHS) 3513 .addUse(LHS) 3514 .setMIFlags(Flags); 3515 3516 MI.eraseFromParent(); 3517 return true; 3518 } 3519 3520 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3521 MachineRegisterInfo &MRI, 3522 MachineIRBuilder &B) const { 3523 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3524 return true; 3525 3526 Register Res = MI.getOperand(0).getReg(); 3527 Register LHS = MI.getOperand(1).getReg(); 3528 Register RHS = MI.getOperand(2).getReg(); 3529 3530 uint16_t Flags = MI.getFlags(); 3531 3532 LLT S64 = LLT::scalar(64); 3533 LLT S1 = LLT::scalar(1); 3534 3535 auto One = B.buildFConstant(S64, 1.0); 3536 3537 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3538 .addUse(LHS) 3539 .addUse(RHS) 3540 .addImm(0) 3541 .setMIFlags(Flags); 3542 3543 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3544 3545 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3546 .addUse(DivScale0.getReg(0)) 3547 .setMIFlags(Flags); 3548 3549 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3550 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3551 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3552 3553 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3554 .addUse(LHS) 3555 .addUse(RHS) 3556 .addImm(1) 3557 .setMIFlags(Flags); 3558 3559 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3560 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3561 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3562 3563 Register Scale; 3564 if (!ST.hasUsableDivScaleConditionOutput()) { 3565 // Workaround a hardware bug on SI where the condition output from div_scale 3566 // is not usable. 3567 3568 LLT S32 = LLT::scalar(32); 3569 3570 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3571 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3572 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3573 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3574 3575 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3576 Scale1Unmerge.getReg(1)); 3577 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3578 Scale0Unmerge.getReg(1)); 3579 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3580 } else { 3581 Scale = DivScale1.getReg(1); 3582 } 3583 3584 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3585 .addUse(Fma4.getReg(0)) 3586 .addUse(Fma3.getReg(0)) 3587 .addUse(Mul.getReg(0)) 3588 .addUse(Scale) 3589 .setMIFlags(Flags); 3590 3591 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3592 .addUse(Fmas.getReg(0)) 3593 .addUse(RHS) 3594 .addUse(LHS) 3595 .setMIFlags(Flags); 3596 3597 MI.eraseFromParent(); 3598 return true; 3599 } 3600 3601 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3602 MachineRegisterInfo &MRI, 3603 MachineIRBuilder &B) const { 3604 Register Res = MI.getOperand(0).getReg(); 3605 Register LHS = MI.getOperand(2).getReg(); 3606 Register RHS = MI.getOperand(3).getReg(); 3607 uint16_t Flags = MI.getFlags(); 3608 3609 LLT S32 = LLT::scalar(32); 3610 LLT S1 = LLT::scalar(1); 3611 3612 auto Abs = B.buildFAbs(S32, RHS, Flags); 3613 const APFloat C0Val(1.0f); 3614 3615 auto C0 = B.buildConstant(S32, 0x6f800000); 3616 auto C1 = B.buildConstant(S32, 0x2f800000); 3617 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3618 3619 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3620 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3621 3622 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3623 3624 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3625 .addUse(Mul0.getReg(0)) 3626 .setMIFlags(Flags); 3627 3628 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3629 3630 B.buildFMul(Res, Sel, Mul1, Flags); 3631 3632 MI.eraseFromParent(); 3633 return true; 3634 } 3635 3636 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3637 // FIXME: Why do we handle this one but not other removed instructions? 3638 // 3639 // Reciprocal square root. The clamp prevents infinite results, clamping 3640 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3641 // +-max_float. 
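//
// On subtargets where the instruction was removed, this is expanded below into
// a plain rsq followed by a min/max clamp against +/- the largest finite value
// of the result type.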
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
                 .addUse(Src)
                 .setMIFlags(Flags);

  // We don't need to concern ourselves with the snan handling difference
  // between the IEEE and non-IEEE min/max variants, since the rsq has already
  // quieted (or not quieted) any snan input; just use the form that will
  // directly select for the current mode.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}

static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
  switch (IID) {
  case Intrinsic::amdgcn_ds_fadd:
    return AMDGPU::G_ATOMICRMW_FADD;
  case Intrinsic::amdgcn_ds_fmin:
    return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
  case Intrinsic::amdgcn_ds_fmax:
    return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
  default:
    llvm_unreachable("not a DS FP intrinsic");
  }
}

bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
                                                      MachineInstr &MI,
                                                      Intrinsic::ID IID) const {
  GISelChangeObserver &Observer = Helper.Observer;
  Observer.changingInstr(MI);

  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));

  // The remaining operands were used to set fields in the MemOperand on
  // construction.
  for (int I = 6; I > 3; --I)
    MI.RemoveOperand(I);

  MI.RemoveOperand(1); // Remove the intrinsic ID.
3712 Observer.changedInstr(MI); 3713 return true; 3714 } 3715 3716 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3717 MachineRegisterInfo &MRI, 3718 MachineIRBuilder &B) const { 3719 uint64_t Offset = 3720 ST.getTargetLowering()->getImplicitParameterOffset( 3721 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3722 LLT DstTy = MRI.getType(DstReg); 3723 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3724 3725 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3726 if (!loadInputValue(KernargPtrReg, B, 3727 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3728 return false; 3729 3730 // FIXME: This should be nuw 3731 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3732 return true; 3733 } 3734 3735 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3736 MachineRegisterInfo &MRI, 3737 MachineIRBuilder &B) const { 3738 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3739 if (!MFI->isEntryFunction()) { 3740 return legalizePreloadedArgIntrin(MI, MRI, B, 3741 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3742 } 3743 3744 Register DstReg = MI.getOperand(0).getReg(); 3745 if (!getImplicitArgPtr(DstReg, MRI, B)) 3746 return false; 3747 3748 MI.eraseFromParent(); 3749 return true; 3750 } 3751 3752 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3753 MachineRegisterInfo &MRI, 3754 MachineIRBuilder &B, 3755 unsigned AddrSpace) const { 3756 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3757 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3758 Register Hi32 = Unmerge.getReg(1); 3759 3760 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3761 MI.eraseFromParent(); 3762 return true; 3763 } 3764 3765 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3766 // offset (the offset that is included in bounds checking and swizzling, to be 3767 // split between the instruction's voffset and immoffset fields) and soffset 3768 // (the offset that is excluded from bounds checking and swizzling, to go in 3769 // the instruction's soffset field). This function takes the first kind of 3770 // offset and figures out how to split it between voffset and immoffset. 3771 std::pair<Register, unsigned> 3772 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3773 Register OrigOffset) const { 3774 const unsigned MaxImm = 4095; 3775 Register BaseReg; 3776 unsigned ImmOffset; 3777 const LLT S32 = LLT::scalar(32); 3778 MachineRegisterInfo &MRI = *B.getMRI(); 3779 3780 std::tie(BaseReg, ImmOffset) = 3781 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3782 3783 // If BaseReg is a pointer, convert it to int. 3784 if (MRI.getType(BaseReg).isPointer()) 3785 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3786 3787 // If the immediate value is too big for the immoffset field, put the value 3788 // and -4096 into the immoffset field so that the value that is copied/added 3789 // for the voffset field is a multiple of 4096, and it stands more chance 3790 // of being CSEd with the copy/add for another similar load/store. 3791 // However, do not do that rounding down to a multiple of 4096 if that is a 3792 // negative number, as it appears to be illegal to have a negative offset 3793 // in the vgpr, even if adding the immediate offset makes it positive. 
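  // For example, a constant offset of 4100 is returned as an immoffset of 4
  // together with a voffset contribution of 4096.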
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_pair(BaseReg, ImmOffset);
}

/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
                                          Register VOffset, Register SOffset,
                                          unsigned ImmOffset, Register VIndex,
                                          MachineRegisterInfo &MRI) const {
  Optional<ValueAndVReg> MaybeVOffsetVal =
      getIConstantVRegValWithLookThrough(VOffset, MRI);
  Optional<ValueAndVReg> MaybeSOffsetVal =
      getIConstantVRegValWithLookThrough(SOffset, MRI);
  Optional<ValueAndVReg> MaybeVIndexVal =
      getIConstantVRegValWithLookThrough(VIndex, MRI);
  // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
  // update the MMO with that offset. The stride is unknown so we can only do
  // this if VIndex is constant 0.
  if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
      MaybeVIndexVal->Value == 0) {
    uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
                           MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
    MMO->setOffset(TotalOffset);
  } else {
    // We don't have a constant combined offset to use in the MMO. Give up.
    MMO->setValue((Value *)nullptr);
  }
}

/// Handle register layout difference for f16 images for some subtargets.
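/// On subtargets with unpacked D16 VMEM, each 16-bit element is any-extended
/// into its own 32-bit register; on subtargets with the image store D16 bug,
/// the data is instead repacked into whole dwords.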
3842 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3843 MachineRegisterInfo &MRI, 3844 Register Reg, 3845 bool ImageStore) const { 3846 const LLT S16 = LLT::scalar(16); 3847 const LLT S32 = LLT::scalar(32); 3848 LLT StoreVT = MRI.getType(Reg); 3849 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3850 3851 if (ST.hasUnpackedD16VMem()) { 3852 auto Unmerge = B.buildUnmerge(S16, Reg); 3853 3854 SmallVector<Register, 4> WideRegs; 3855 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3856 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3857 3858 int NumElts = StoreVT.getNumElements(); 3859 3860 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3861 .getReg(0); 3862 } 3863 3864 if (ImageStore && ST.hasImageStoreD16Bug()) { 3865 if (StoreVT.getNumElements() == 2) { 3866 SmallVector<Register, 4> PackedRegs; 3867 Reg = B.buildBitcast(S32, Reg).getReg(0); 3868 PackedRegs.push_back(Reg); 3869 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3870 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3871 .getReg(0); 3872 } 3873 3874 if (StoreVT.getNumElements() == 3) { 3875 SmallVector<Register, 4> PackedRegs; 3876 auto Unmerge = B.buildUnmerge(S16, Reg); 3877 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3878 PackedRegs.push_back(Unmerge.getReg(I)); 3879 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3880 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3881 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3882 } 3883 3884 if (StoreVT.getNumElements() == 4) { 3885 SmallVector<Register, 4> PackedRegs; 3886 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3887 auto Unmerge = B.buildUnmerge(S32, Reg); 3888 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3889 PackedRegs.push_back(Unmerge.getReg(I)); 3890 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3891 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3892 .getReg(0); 3893 } 3894 3895 llvm_unreachable("invalid data type"); 3896 } 3897 3898 if (StoreVT == LLT::fixed_vector(3, S16)) { 3899 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 3900 .getReg(0); 3901 } 3902 return Reg; 3903 } 3904 3905 Register AMDGPULegalizerInfo::fixStoreSourceType( 3906 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3907 MachineRegisterInfo *MRI = B.getMRI(); 3908 LLT Ty = MRI->getType(VData); 3909 3910 const LLT S16 = LLT::scalar(16); 3911 3912 // Fixup illegal register types for i8 stores. 
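  // Both s8 and s16 sources are any-extended to a full 32-bit register.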
3913 if (Ty == LLT::scalar(8) || Ty == S16) { 3914 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3915 return AnyExt; 3916 } 3917 3918 if (Ty.isVector()) { 3919 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3920 if (IsFormat) 3921 return handleD16VData(B, *MRI, VData); 3922 } 3923 } 3924 3925 return VData; 3926 } 3927 3928 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3929 MachineRegisterInfo &MRI, 3930 MachineIRBuilder &B, 3931 bool IsTyped, 3932 bool IsFormat) const { 3933 Register VData = MI.getOperand(1).getReg(); 3934 LLT Ty = MRI.getType(VData); 3935 LLT EltTy = Ty.getScalarType(); 3936 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3937 const LLT S32 = LLT::scalar(32); 3938 3939 VData = fixStoreSourceType(B, VData, IsFormat); 3940 Register RSrc = MI.getOperand(2).getReg(); 3941 3942 MachineMemOperand *MMO = *MI.memoperands_begin(); 3943 const int MemSize = MMO->getSize(); 3944 3945 unsigned ImmOffset; 3946 3947 // The typed intrinsics add an immediate after the registers. 3948 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3949 3950 // The struct intrinsic variants add one additional operand over raw. 3951 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3952 Register VIndex; 3953 int OpOffset = 0; 3954 if (HasVIndex) { 3955 VIndex = MI.getOperand(3).getReg(); 3956 OpOffset = 1; 3957 } else { 3958 VIndex = B.buildConstant(S32, 0).getReg(0); 3959 } 3960 3961 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3962 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3963 3964 unsigned Format = 0; 3965 if (IsTyped) { 3966 Format = MI.getOperand(5 + OpOffset).getImm(); 3967 ++OpOffset; 3968 } 3969 3970 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3971 3972 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3973 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3974 3975 unsigned Opc; 3976 if (IsTyped) { 3977 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3978 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3979 } else if (IsFormat) { 3980 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3981 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3982 } else { 3983 switch (MemSize) { 3984 case 1: 3985 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3986 break; 3987 case 2: 3988 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3989 break; 3990 default: 3991 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3992 break; 3993 } 3994 } 3995 3996 auto MIB = B.buildInstr(Opc) 3997 .addUse(VData) // vdata 3998 .addUse(RSrc) // rsrc 3999 .addUse(VIndex) // vindex 4000 .addUse(VOffset) // voffset 4001 .addUse(SOffset) // soffset 4002 .addImm(ImmOffset); // offset(imm) 4003 4004 if (IsTyped) 4005 MIB.addImm(Format); 4006 4007 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4008 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4009 .addMemOperand(MMO); 4010 4011 MI.eraseFromParent(); 4012 return true; 4013 } 4014 4015 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 4016 MachineRegisterInfo &MRI, 4017 MachineIRBuilder &B, 4018 bool IsFormat, 4019 bool IsTyped) const { 4020 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 4021 MachineMemOperand *MMO = *MI.memoperands_begin(); 4022 const LLT MemTy = MMO->getMemoryType(); 4023 const LLT S32 = LLT::scalar(32); 4024 4025 Register Dst = MI.getOperand(0).getReg(); 4026 Register RSrc = MI.getOperand(2).getReg(); 4027 4028 // The typed intrinsics add an immediate after the registers. 
4029 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4030 4031 // The struct intrinsic variants add one additional operand over raw. 4032 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4033 Register VIndex; 4034 int OpOffset = 0; 4035 if (HasVIndex) { 4036 VIndex = MI.getOperand(3).getReg(); 4037 OpOffset = 1; 4038 } else { 4039 VIndex = B.buildConstant(S32, 0).getReg(0); 4040 } 4041 4042 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4043 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4044 4045 unsigned Format = 0; 4046 if (IsTyped) { 4047 Format = MI.getOperand(5 + OpOffset).getImm(); 4048 ++OpOffset; 4049 } 4050 4051 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4052 unsigned ImmOffset; 4053 4054 LLT Ty = MRI.getType(Dst); 4055 LLT EltTy = Ty.getScalarType(); 4056 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4057 const bool Unpacked = ST.hasUnpackedD16VMem(); 4058 4059 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4060 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4061 4062 unsigned Opc; 4063 4064 if (IsTyped) { 4065 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4066 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4067 } else if (IsFormat) { 4068 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4069 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4070 } else { 4071 switch (MemTy.getSizeInBits()) { 4072 case 8: 4073 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4074 break; 4075 case 16: 4076 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4077 break; 4078 default: 4079 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4080 break; 4081 } 4082 } 4083 4084 Register LoadDstReg; 4085 4086 bool IsExtLoad = 4087 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4088 LLT UnpackedTy = Ty.changeElementSize(32); 4089 4090 if (IsExtLoad) 4091 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4092 else if (Unpacked && IsD16 && Ty.isVector()) 4093 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4094 else 4095 LoadDstReg = Dst; 4096 4097 auto MIB = B.buildInstr(Opc) 4098 .addDef(LoadDstReg) // vdata 4099 .addUse(RSrc) // rsrc 4100 .addUse(VIndex) // vindex 4101 .addUse(VOffset) // voffset 4102 .addUse(SOffset) // soffset 4103 .addImm(ImmOffset); // offset(imm) 4104 4105 if (IsTyped) 4106 MIB.addImm(Format); 4107 4108 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4109 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4110 .addMemOperand(MMO); 4111 4112 if (LoadDstReg != Dst) { 4113 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4114 4115 // Widen result for extending loads was widened. 4116 if (IsExtLoad) 4117 B.buildTrunc(Dst, LoadDstReg); 4118 else { 4119 // Repack to original 16-bit vector result 4120 // FIXME: G_TRUNC should work, but legalization currently fails 4121 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4122 SmallVector<Register, 4> Repack; 4123 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4124 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4125 B.buildMerge(Dst, Repack); 4126 } 4127 } 4128 4129 MI.eraseFromParent(); 4130 return true; 4131 } 4132 4133 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4134 MachineIRBuilder &B, 4135 bool IsInc) const { 4136 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4137 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4138 B.buildInstr(Opc) 4139 .addDef(MI.getOperand(0).getReg()) 4140 .addUse(MI.getOperand(2).getReg()) 4141 .addUse(MI.getOperand(3).getReg()) 4142 .cloneMemRefs(MI); 4143 MI.eraseFromParent(); 4144 return true; 4145 } 4146 4147 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4148 switch (IntrID) { 4149 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4150 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4151 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4152 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4153 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4154 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4155 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4156 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4157 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4158 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4159 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4160 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4161 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4162 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4163 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4164 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4165 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4166 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4167 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4168 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4169 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4170 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4171 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4172 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4173 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4174 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4175 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4176 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4177 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4178 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4179 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4180 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4181 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4182 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4183 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4184 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4185 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4186 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4187 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4188 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4189 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4190 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4191 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4192 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4193 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4194 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4195 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4196 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4197 default: 4198 llvm_unreachable("unhandled atomic opcode"); 4199 } 4200 } 4201 4202 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4203 MachineIRBuilder &B, 4204 Intrinsic::ID IID) const { 4205 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4206 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4207 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4208 4209 Register Dst; 4210 4211 int OpOffset = 0; 4212 if (HasReturn) { 4213 // A few FP atomics do not support return values. 
4214 Dst = MI.getOperand(0).getReg(); 4215 } else { 4216 OpOffset = -1; 4217 } 4218 4219 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4220 Register CmpVal; 4221 4222 if (IsCmpSwap) { 4223 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4224 ++OpOffset; 4225 } 4226 4227 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4228 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4229 4230 // The struct intrinsic variants add one additional operand over raw. 4231 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4232 Register VIndex; 4233 if (HasVIndex) { 4234 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4235 ++OpOffset; 4236 } else { 4237 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4238 } 4239 4240 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4241 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4242 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4243 4244 MachineMemOperand *MMO = *MI.memoperands_begin(); 4245 4246 unsigned ImmOffset; 4247 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4248 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4249 4250 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4251 4252 if (HasReturn) 4253 MIB.addDef(Dst); 4254 4255 MIB.addUse(VData); // vdata 4256 4257 if (IsCmpSwap) 4258 MIB.addReg(CmpVal); 4259 4260 MIB.addUse(RSrc) // rsrc 4261 .addUse(VIndex) // vindex 4262 .addUse(VOffset) // voffset 4263 .addUse(SOffset) // soffset 4264 .addImm(ImmOffset) // offset(imm) 4265 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4266 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4267 .addMemOperand(MMO); 4268 4269 MI.eraseFromParent(); 4270 return true; 4271 } 4272 4273 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4274 /// vector with s16 typed elements. 4275 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4276 SmallVectorImpl<Register> &PackedAddrs, 4277 unsigned ArgOffset, 4278 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4279 bool IsA16, bool IsG16) { 4280 const LLT S16 = LLT::scalar(16); 4281 const LLT V2S16 = LLT::fixed_vector(2, 16); 4282 auto EndIdx = Intr->VAddrEnd; 4283 4284 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4285 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4286 if (!SrcOp.isReg()) 4287 continue; // _L to _LZ may have eliminated this. 4288 4289 Register AddrReg = SrcOp.getReg(); 4290 4291 if ((I < Intr->GradientStart) || 4292 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4293 (I >= Intr->CoordStart && !IsA16)) { 4294 if ((I < Intr->GradientStart) && IsA16 && 4295 (B.getMRI()->getType(AddrReg) == S16)) { 4296 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 4297 // Special handling of bias when A16 is on. Bias is of type half but 4298 // occupies full 32-bit. 4299 PackedAddrs.push_back( 4300 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4301 .getReg(0)); 4302 } else { 4303 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 4304 "Bias needs to be converted to 16 bit in A16 mode"); 4305 // Handle any gradient or coordinate operands that should not be packed 4306 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4307 PackedAddrs.push_back(AddrReg); 4308 } 4309 } else { 4310 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4311 // derivatives dx/dh and dx/dv are packed with undef. 
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Above 8 elements, round up to the next power of 2 (i.e. 16).
    if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads and stores with 16-bit element data need
/// to be rewritten to use the low half of 32-bit registers, or directly use a
/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  unsigned DMask = 0;
  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
  LLT Ty = MRI->getType(VData);

  // Check for 16 bit addresses and pack if true.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = Ty.getScalarType() == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;

  // Track that we legalized this.
  MI.setDesc(B.getTII().get(NewOpcode));

  // With TFE on and a dmask of 0 we still expect to get the error flag, so
  // force the dmask to be at least 1; otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::fixed_vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Rewrite the addressing register layout before doing anything else.
4472 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4473 // 16 bit gradients are supported, but are tied to the A16 control 4474 // so both gradients and addresses must be 16 bit 4475 return false; 4476 } 4477 4478 if (IsA16 && !ST.hasA16()) { 4479 // A16 not supported 4480 return false; 4481 } 4482 4483 if (IsA16 || IsG16) { 4484 if (Intr->NumVAddrs > 1) { 4485 SmallVector<Register, 4> PackedRegs; 4486 4487 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4488 IsG16); 4489 4490 // See also below in the non-a16 branch 4491 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4492 PackedRegs.size() <= ST.getNSAMaxSize(); 4493 4494 if (!UseNSA && PackedRegs.size() > 1) { 4495 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4496 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4497 PackedRegs[0] = Concat.getReg(0); 4498 PackedRegs.resize(1); 4499 } 4500 4501 const unsigned NumPacked = PackedRegs.size(); 4502 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4503 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4504 if (!SrcOp.isReg()) { 4505 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4506 continue; 4507 } 4508 4509 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4510 4511 if (I - Intr->VAddrStart < NumPacked) 4512 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4513 else 4514 SrcOp.setReg(AMDGPU::NoRegister); 4515 } 4516 } 4517 } else { 4518 // If the register allocator cannot place the address registers contiguously 4519 // without introducing moves, then using the non-sequential address encoding 4520 // is always preferable, since it saves VALU instructions and is usually a 4521 // wash in terms of code size or even better. 4522 // 4523 // However, we currently have no way of hinting to the register allocator 4524 // that MIMG addresses should be placed contiguously when it is possible to 4525 // do so, so force non-NSA for the common 2-address case as a heuristic. 4526 // 4527 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4528 // allocation when possible. 4529 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4530 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4531 4532 if (!UseNSA && Intr->NumVAddrs > 1) 4533 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4534 Intr->NumVAddrs); 4535 } 4536 4537 int Flags = 0; 4538 if (IsA16) 4539 Flags |= 1; 4540 if (IsG16) 4541 Flags |= 2; 4542 MI.addOperand(MachineOperand::CreateImm(Flags)); 4543 4544 if (BaseOpcode->Store) { // No TFE for stores? 4545 // TODO: Handle dmask trim 4546 if (!Ty.isVector() || !IsD16) 4547 return true; 4548 4549 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4550 if (RepackedReg != VData) { 4551 MI.getOperand(1).setReg(RepackedReg); 4552 } 4553 4554 return true; 4555 } 4556 4557 Register DstReg = MI.getOperand(0).getReg(); 4558 const LLT EltTy = Ty.getScalarType(); 4559 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4560 4561 // Confirm that the return type is large enough for the dmask specified 4562 if (NumElts < DMaskLanes) 4563 return false; 4564 4565 if (NumElts > 4 || DMaskLanes > 4) 4566 return false; 4567 4568 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4569 const LLT AdjustedTy = 4570 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4571 4572 // The raw dword aligned data component of the load. 
The only legal cases 4573 // where this matters should be when using the packed D16 format, for 4574 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4575 LLT RoundedTy; 4576 4577 // S32 vector to to cover all data, plus TFE result element. 4578 LLT TFETy; 4579 4580 // Register type to use for each loaded component. Will be S32 or V2S16. 4581 LLT RegTy; 4582 4583 if (IsD16 && ST.hasUnpackedD16VMem()) { 4584 RoundedTy = 4585 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4586 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4587 RegTy = S32; 4588 } else { 4589 unsigned EltSize = EltTy.getSizeInBits(); 4590 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4591 unsigned RoundedSize = 32 * RoundedElts; 4592 RoundedTy = LLT::scalarOrVector( 4593 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4594 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4595 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4596 } 4597 4598 // The return type does not need adjustment. 4599 // TODO: Should we change s16 case to s32 or <2 x s16>? 4600 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4601 return true; 4602 4603 Register Dst1Reg; 4604 4605 // Insert after the instruction. 4606 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4607 4608 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4609 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4610 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4611 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4612 4613 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4614 4615 MI.getOperand(0).setReg(NewResultReg); 4616 4617 // In the IR, TFE is supposed to be used with a 2 element struct return 4618 // type. The instruction really returns these two values in one contiguous 4619 // register, with one additional dword beyond the loaded data. Rewrite the 4620 // return type to use a single register result. 4621 4622 if (IsTFE) { 4623 Dst1Reg = MI.getOperand(1).getReg(); 4624 if (MRI->getType(Dst1Reg) != S32) 4625 return false; 4626 4627 // TODO: Make sure the TFE operand bit is set. 4628 MI.RemoveOperand(1); 4629 4630 // Handle the easy case that requires no repack instructions. 4631 if (Ty == S32) { 4632 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4633 return true; 4634 } 4635 } 4636 4637 // Now figure out how to copy the new result register back into the old 4638 // result. 4639 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4640 4641 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4642 4643 if (ResultNumRegs == 1) { 4644 assert(!IsTFE); 4645 ResultRegs[0] = NewResultReg; 4646 } else { 4647 // We have to repack into a new vector of some kind. 4648 for (int I = 0; I != NumDataRegs; ++I) 4649 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4650 B.buildUnmerge(ResultRegs, NewResultReg); 4651 4652 // Drop the final TFE element to get the data part. The TFE result is 4653 // directly written to the right place already. 4654 if (IsTFE) 4655 ResultRegs.resize(NumDataRegs); 4656 } 4657 4658 // For an s16 scalar result, we form an s32 result with a truncate regardless 4659 // of packed vs. unpacked. 4660 if (IsD16 && !Ty.isVector()) { 4661 B.buildTrunc(DstReg, ResultRegs[0]); 4662 return true; 4663 } 4664 4665 // Avoid a build/concat_vector of 1 entry. 
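  // A single packed v2s16 piece already matches the register layout, so a
  // bitcast to the destination type is enough.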
4666 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4667 B.buildBitcast(DstReg, ResultRegs[0]); 4668 return true; 4669 } 4670 4671 assert(Ty.isVector()); 4672 4673 if (IsD16) { 4674 // For packed D16 results with TFE enabled, all the data components are 4675 // S32. Cast back to the expected type. 4676 // 4677 // TODO: We don't really need to use load s32 elements. We would only need one 4678 // cast for the TFE result if a multiple of v2s16 was used. 4679 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4680 for (Register &Reg : ResultRegs) 4681 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4682 } else if (ST.hasUnpackedD16VMem()) { 4683 for (Register &Reg : ResultRegs) 4684 Reg = B.buildTrunc(S16, Reg).getReg(0); 4685 } 4686 } 4687 4688 auto padWithUndef = [&](LLT Ty, int NumElts) { 4689 if (NumElts == 0) 4690 return; 4691 Register Undef = B.buildUndef(Ty).getReg(0); 4692 for (int I = 0; I != NumElts; ++I) 4693 ResultRegs.push_back(Undef); 4694 }; 4695 4696 // Pad out any elements eliminated due to the dmask. 4697 LLT ResTy = MRI->getType(ResultRegs[0]); 4698 if (!ResTy.isVector()) { 4699 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4700 B.buildBuildVector(DstReg, ResultRegs); 4701 return true; 4702 } 4703 4704 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4705 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4706 4707 // Deal with the one annoying legal case. 4708 const LLT V3S16 = LLT::fixed_vector(3, 16); 4709 if (Ty == V3S16) { 4710 if (IsTFE) { 4711 if (ResultRegs.size() == 1) { 4712 NewResultReg = ResultRegs[0]; 4713 } else if (ResultRegs.size() == 2) { 4714 LLT V4S16 = LLT::fixed_vector(4, 16); 4715 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 4716 } else { 4717 return false; 4718 } 4719 } 4720 4721 if (MRI->getType(DstReg).getNumElements() < 4722 MRI->getType(NewResultReg).getNumElements()) { 4723 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 4724 } else { 4725 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 4726 } 4727 return true; 4728 } 4729 4730 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4731 B.buildConcatVectors(DstReg, ResultRegs); 4732 return true; 4733 } 4734 4735 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4736 LegalizerHelper &Helper, MachineInstr &MI) const { 4737 MachineIRBuilder &B = Helper.MIRBuilder; 4738 GISelChangeObserver &Observer = Helper.Observer; 4739 4740 Register Dst = MI.getOperand(0).getReg(); 4741 LLT Ty = B.getMRI()->getType(Dst); 4742 unsigned Size = Ty.getSizeInBits(); 4743 MachineFunction &MF = B.getMF(); 4744 4745 Observer.changingInstr(MI); 4746 4747 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4748 Ty = getBitcastRegisterType(Ty); 4749 Helper.bitcastDst(MI, Ty, 0); 4750 Dst = MI.getOperand(0).getReg(); 4751 B.setInsertPt(B.getMBB(), MI); 4752 } 4753 4754 // FIXME: We don't really need this intermediate instruction. The intrinsic 4755 // should be fixed to have a memory operand. Since it's readnone, we're not 4756 // allowed to add one. 4757 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4758 MI.RemoveOperand(1); // Remove intrinsic ID 4759 4760 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4761 // TODO: Should this use datalayout alignment? 
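  // Round the result size in bits up to a whole number of bytes for the
  // synthesized memory operand.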
4762 const unsigned MemSize = (Size + 7) / 8; 4763 const Align MemAlign(4); 4764 MachineMemOperand *MMO = MF.getMachineMemOperand( 4765 MachinePointerInfo(), 4766 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4767 MachineMemOperand::MOInvariant, 4768 MemSize, MemAlign); 4769 MI.addMemOperand(MF, MMO); 4770 4771 // There are no 96-bit result scalar loads, but widening to 128-bit should 4772 // always be legal. We may need to restore this to a 96-bit result if it turns 4773 // out this needs to be converted to a vector load during RegBankSelect. 4774 if (!isPowerOf2_32(Size)) { 4775 if (Ty.isVector()) 4776 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4777 else 4778 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4779 } 4780 4781 Observer.changedInstr(MI); 4782 return true; 4783 } 4784 4785 // TODO: Move to selection 4786 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4787 MachineRegisterInfo &MRI, 4788 MachineIRBuilder &B) const { 4789 if (!ST.isTrapHandlerEnabled() || 4790 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4791 return legalizeTrapEndpgm(MI, MRI, B); 4792 4793 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4794 switch (*HsaAbiVer) { 4795 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4796 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4797 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4798 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4799 case ELF::ELFABIVERSION_AMDGPU_HSA_V5: 4800 return ST.supportsGetDoorbellID() ? 4801 legalizeTrapHsa(MI, MRI, B) : 4802 legalizeTrapHsaQueuePtr(MI, MRI, B); 4803 } 4804 } 4805 4806 llvm_unreachable("Unknown trap handler"); 4807 } 4808 4809 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4810 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4811 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4812 MI.eraseFromParent(); 4813 return true; 4814 } 4815 4816 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4817 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4818 // Pass queue pointer to trap handler as input, and insert trap instruction 4819 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4820 Register LiveIn = 4821 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4822 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4823 return false; 4824 4825 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4826 B.buildCopy(SGPR01, LiveIn); 4827 B.buildInstr(AMDGPU::S_TRAP) 4828 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4829 .addReg(SGPR01, RegState::Implicit); 4830 4831 MI.eraseFromParent(); 4832 return true; 4833 } 4834 4835 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4836 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4837 B.buildInstr(AMDGPU::S_TRAP) 4838 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4839 MI.eraseFromParent(); 4840 return true; 4841 } 4842 4843 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4844 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4845 // Is non-HSA path or trap-handler disabled? 
Then, report a warning 4846 // accordingly 4847 if (!ST.isTrapHandlerEnabled() || 4848 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4849 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4850 "debugtrap handler not supported", 4851 MI.getDebugLoc(), DS_Warning); 4852 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4853 Ctx.diagnose(NoTrap); 4854 } else { 4855 // Insert debug-trap instruction 4856 B.buildInstr(AMDGPU::S_TRAP) 4857 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4858 } 4859 4860 MI.eraseFromParent(); 4861 return true; 4862 } 4863 4864 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4865 MachineIRBuilder &B) const { 4866 MachineRegisterInfo &MRI = *B.getMRI(); 4867 const LLT S16 = LLT::scalar(16); 4868 const LLT S32 = LLT::scalar(32); 4869 4870 Register DstReg = MI.getOperand(0).getReg(); 4871 Register NodePtr = MI.getOperand(2).getReg(); 4872 Register RayExtent = MI.getOperand(3).getReg(); 4873 Register RayOrigin = MI.getOperand(4).getReg(); 4874 Register RayDir = MI.getOperand(5).getReg(); 4875 Register RayInvDir = MI.getOperand(6).getReg(); 4876 Register TDescr = MI.getOperand(7).getReg(); 4877 4878 if (!ST.hasGFX10_AEncoding()) { 4879 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4880 "intrinsic not supported on subtarget", 4881 MI.getDebugLoc()); 4882 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4883 return false; 4884 } 4885 4886 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4887 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4888 const unsigned NumVDataDwords = 4; 4889 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 4890 const bool UseNSA = 4891 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4892 const unsigned BaseOpcodes[2][2] = { 4893 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4894 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4895 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4896 int Opcode; 4897 if (UseNSA) { 4898 Opcode = 4899 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4900 NumVDataDwords, NumVAddrDwords); 4901 } else { 4902 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4903 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4904 PowerOf2Ceil(NumVAddrDwords)); 4905 } 4906 assert(Opcode != -1); 4907 4908 SmallVector<Register, 12> Ops; 4909 if (Is64) { 4910 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4911 Ops.push_back(Unmerge.getReg(0)); 4912 Ops.push_back(Unmerge.getReg(1)); 4913 } else { 4914 Ops.push_back(NodePtr); 4915 } 4916 Ops.push_back(RayExtent); 4917 4918 auto packLanes = [&Ops, &S32, &B](Register Src) { 4919 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 4920 Ops.push_back(Unmerge.getReg(0)); 4921 Ops.push_back(Unmerge.getReg(1)); 4922 Ops.push_back(Unmerge.getReg(2)); 4923 }; 4924 4925 packLanes(RayOrigin); 4926 if (IsA16) { 4927 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 4928 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 4929 Register R1 = MRI.createGenericVirtualRegister(S32); 4930 Register R2 = MRI.createGenericVirtualRegister(S32); 4931 Register R3 = MRI.createGenericVirtualRegister(S32); 4932 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4933 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4934 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4935 Ops.push_back(R1); 4936 
  if (IsA16) {
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands prepared so far.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
    .addDef(DstReg)
    .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
     .addImm(IsA16 ? 1 : 0)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
                                int64_t C) {
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  return true;
}
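
// Note on the control-flow intrinsics handled at the top of legalizeIntrinsic
// below: amdgcn.if/else/loop are lowered here, together with their G_BRCOND
// user, into the SI_IF/SI_ELSE/SI_LOOP pseudos that both produce the exec
// mask and perform the branch. A rough, illustrative sketch only (operand
// order and types simplified; negated and fallthrough cases are handled in
// the code):
//
//   %cond:_(s1), %mask:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS llvm.amdgcn.if, %src
//   G_BRCOND %cond(s1), %bb.inner
//   G_BR %bb.flow
//
// becomes something like:
//
//   %mask = SI_IF %src, %bb.flow
//   G_BR %bb.inner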
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND use with the exec-manipulation and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
      return replaceWithConstant(B, MI, 0);
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
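  // The raw/struct buffer and tbuffer loads and stores below funnel into the
  // shared legalizeBufferLoad/legalizeBufferStore helpers; the boolean
  // arguments distinguish the plain, format, and typed (tbuffer) variants.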
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
    Register DstReg = MI.getOperand(0).getReg();
    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
      Function &F = B.getMF().getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "return versions of fp atomics not supported", B.getDebugLoc(),
          DS_Error);
      F.getContext().diagnose(NoFpRet);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    return legalizeBufferAtomic(MI, B, IntrID);
  }
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}